/* * Write a TRUNCATE xlog record */ static void WriteTruncateXlogRec(int pageno) { XLogBeginInsert(); XLogRegisterData((char *) (&pageno), sizeof(int)); (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_TRUNCATE); }
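/*
 * Illustrative sketch (not from the original source): the bare-bones pattern
 * that WriteTruncateXlogRec() and the other simple helpers here follow when a
 * record carries no buffer references.  The resource manager, info code, and
 * record struct below (RM_FOO_ID, XLOG_FOO_EXAMPLE, xl_foo_example) are
 * hypothetical placeholders.
 */
typedef struct xl_foo_example
{
	int			value;			/* payload the redo routine would need */
} xl_foo_example;

static void
WriteFooExampleXlogRec(int value)
{
	xl_foo_example xlrec;

	xlrec.value = value;

	XLogBeginInsert();
	XLogRegisterData((char *) &xlrec, sizeof(xl_foo_example));

	/* no registered buffers: the record consists of main data only */
	(void) XLogInsert(RM_FOO_ID, XLOG_FOO_EXAMPLE);
}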
/* * Write XLOG record describing a delete of leaf index tuples marked as DEAD * during new tuple insertion. One may think that this case is already covered * by gistXLogUpdate(). But deletion of index tuples might conflict with * standby queries and needs special handling. */ XLogRecPtr gistXLogDelete(Buffer buffer, OffsetNumber *todelete, int ntodelete, RelFileNode hnode) { gistxlogDelete xlrec; XLogRecPtr recptr; xlrec.hnode = hnode; xlrec.ntodelete = ntodelete; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfGistxlogDelete); /* * We need the target-offsets array whether or not we store the whole * buffer, to allow us to find the latestRemovedXid on a standby server. */ XLogRegisterData((char *) todelete, ntodelete * sizeof(OffsetNumber)); XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_DELETE); return recptr; }
/* * Write XLOG record describing a page update. The update can include any * number of deletions and/or insertions of tuples on a single index page. * * If this update inserts a downlink for a split page, also record that * the F_FOLLOW_RIGHT flag on the child page is cleared and NSN set. * * Note that both the todelete array and the tuples are marked as belonging * to the target buffer; they need not be stored in XLOG if XLogInsert decides * to log the whole buffer contents instead. */ XLogRecPtr gistXLogUpdate(Buffer buffer, OffsetNumber *todelete, int ntodelete, IndexTuple *itup, int ituplen, Buffer leftchildbuf) { gistxlogPageUpdate xlrec; int i; XLogRecPtr recptr; xlrec.ntodelete = ntodelete; xlrec.ntoinsert = ituplen; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageUpdate)); XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); XLogRegisterBufData(0, (char *) todelete, sizeof(OffsetNumber) * ntodelete); /* new tuples */ for (i = 0; i < ituplen; i++) XLogRegisterBufData(0, (char *) (itup[i]), IndexTupleSize(itup[i])); /* * Include a full page image of the child buf. (only necessary if a * checkpoint happened since the child page was split) */ if (BufferIsValid(leftchildbuf)) XLogRegisterBuffer(1, leftchildbuf, REGBUF_STANDARD); recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE); return recptr; }
/* * Remove the visibility map fork for a relation. If there turn out to be * any bugs in the visibility map code that require rebuilding the VM, this * provides users with a way to do it that is cleaner than shutting down the * server and removing files by hand. * * This is a cut-down version of RelationTruncate. */ Datum pg_truncate_visibility_map(PG_FUNCTION_ARGS) { Oid relid = PG_GETARG_OID(0); Relation rel; rel = relation_open(relid, AccessExclusiveLock); if (rel->rd_rel->relkind != RELKIND_RELATION && rel->rd_rel->relkind != RELKIND_MATVIEW && rel->rd_rel->relkind != RELKIND_TOASTVALUE) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is not a table, materialized view, or TOAST table", RelationGetRelationName(rel)))); RelationOpenSmgr(rel); rel->rd_smgr->smgr_vm_nblocks = InvalidBlockNumber; visibilitymap_truncate(rel, 0); if (RelationNeedsWAL(rel)) { xl_smgr_truncate xlrec; xlrec.blkno = 0; xlrec.rnode = rel->rd_node; xlrec.flags = SMGR_TRUNCATE_VM; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, sizeof(xlrec)); XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE); } /* * Release the lock right away, not at commit time. * * It would be a problem to release the lock prior to commit if this * truncate operation sends any transactional invalidation messages. Other * backends would potentially be able to lock the relation without * processing them in the window of time between when we release the lock * here and when we send the messages at our eventual commit. However, * we're currently only sending a non-transactional smgr invalidation, * which will have been posted to shared memory immediately from within * visibilitymap_truncate. Therefore, there should be no race here. * * It's desirable to release the lock early here because someone may need * to use this to blow away many visibility map forks at once. If we can't * release the lock until commit time, the transaction doing this will * accumulate AccessExclusiveLocks on all of those relations at the same * time, which is undesirable. However, if this turns out to be unsafe we * may have no choice... */ relation_close(rel, AccessExclusiveLock); /* Nothing to return. */ PG_RETURN_VOID(); }
/* * Write a ZEROPAGE xlog record */ static void WriteZeroPageXlogRec(int pageno) { XLogBeginInsert(); XLogRegisterData((char *) (&pageno), sizeof(int)); (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE); }
/* * Write logical decoding message into XLog. */ XLogRecPtr LogLogicalMessage(const char *prefix, const char *message, size_t size, bool transactional) { xl_logical_message xlrec; /* * Force xid to be allocated if we're emitting a transactional message. */ if (transactional) { Assert(IsTransactionState()); GetCurrentTransactionId(); } xlrec.dbId = MyDatabaseId; xlrec.transactional = transactional; xlrec.prefix_size = strlen(prefix) + 1; xlrec.message_size = size; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfLogicalMessage); XLogRegisterData((char *) prefix, xlrec.prefix_size); XLogRegisterData((char *) message, size); /* allow origin filtering */ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); return XLogInsert(RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE); }
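/*
 * Usage sketch, assuming an extension wants to emit decoding messages through
 * LogLogicalMessage() directly (the SQL-level equivalent is
 * pg_logical_emit_message).  The prefix "example" is arbitrary; a real caller
 * would use a prefix its output plugin recognizes.  The transactional variant
 * must be called inside a transaction, per the Assert above.
 */
static void
emit_example_messages(void)
{
	const char *payload = "hello, decoder";

	/* delivered to decoding plugins at this transaction's commit */
	(void) LogLogicalMessage("example", payload, strlen(payload), true);

	/* decoded immediately, independent of the surrounding transaction */
	(void) LogLogicalMessage("example", payload, strlen(payload), false);
}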
/* * Write a WAL record containing a full image of a page. Caller is responsible * for writing the page to disk after calling this routine. * * Note: If you're using this function, you should be building pages in private * memory and writing them directly to smgr. If you're using buffers, call * log_newpage_buffer instead. * * If the page follows the standard page layout, with a PageHeader and unused * space between pd_lower and pd_upper, set 'page_std' to TRUE. That allows * the unused space to be left out from the WAL record, making it smaller. */ XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, Page page, bool page_std) { int flags; XLogRecPtr recptr; flags = REGBUF_FORCE_IMAGE; if (page_std) flags |= REGBUF_STANDARD; XLogBeginInsert(); XLogRegisterBlock(0, rnode, forkNum, blkno, page, flags); recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI); /* * The page may be uninitialized. If so, we can't set the LSN because that * would corrupt the page. */ if (!PageIsNew(page)) { PageSetLSN(page, recptr); } return recptr; }
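/*
 * Rough usage sketch for log_newpage(), along the lines the comment above
 * describes: build a page in private memory, WAL-log it, then write it out
 * through smgr.  Error handling and any relation extension locking a real
 * caller would need are omitted; the helper name is hypothetical.
 */
static void
write_and_log_new_page(SMgrRelation srel, ForkNumber forknum, BlockNumber blkno)
{
	char	   *buf = (char *) palloc(BLCKSZ);
	Page		page = (Page) buf;

	PageInit(page, BLCKSZ, 0);

	/* WAL first; log_newpage() also sets the page LSN for us */
	(void) log_newpage(&srel->smgr_rnode.node, forknum, blkno, page, true);

	PageSetChecksumInplace(page, blkno);
	smgrextend(srel, forknum, blkno, buf, false);

	pfree(buf);
}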
/* * Write a TRUNCATE xlog record */ static void WriteTruncateXlogRec(int pageno, TransactionId oldestXid) { xl_commit_ts_truncate xlrec; xlrec.pageno = pageno; xlrec.oldestXid = oldestXid; XLogBeginInsert(); XLogRegisterData((char *) (&xlrec), SizeOfCommitTsTruncate); (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_TRUNCATE); }
/* * Write WAL record of a page split. */ XLogRecPtr gistXLogSplit(bool page_is_leaf, SplitedPageLayout *dist, BlockNumber origrlink, GistNSN orignsn, Buffer leftchildbuf, bool markfollowright) { gistxlogPageSplit xlrec; SplitedPageLayout *ptr; int npage = 0; XLogRecPtr recptr; int i; for (ptr = dist; ptr; ptr = ptr->next) npage++; xlrec.origrlink = origrlink; xlrec.orignsn = orignsn; xlrec.origleaf = page_is_leaf; xlrec.npage = (uint16) npage; xlrec.markfollowright = markfollowright; XLogBeginInsert(); /* * Include a full page image of the child buf. (only necessary if a * checkpoint happened since the child page was split) */ if (BufferIsValid(leftchildbuf)) XLogRegisterBuffer(0, leftchildbuf, REGBUF_STANDARD); /* * NOTE: We register a lot of data. The caller must've called * XLogEnsureRecordSpace() to prepare for that. We cannot do it here, * because we're already in a critical section. If you change the number * of buffer or data registrations here, make sure you modify the * XLogEnsureRecordSpace() calls accordingly! */ XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageSplit)); i = 1; for (ptr = dist; ptr; ptr = ptr->next) { XLogRegisterBuffer(i, ptr->buffer, REGBUF_WILL_INIT); XLogRegisterBufData(i, (char *) &(ptr->block.num), sizeof(int)); XLogRegisterBufData(i, (char *) ptr->list, ptr->lenlist); i++; } recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_SPLIT); return recptr; }
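/*
 * Caller-side sketch of the XLogEnsureRecordSpace() requirement mentioned in
 * the NOTE above: because gistXLogSplit() registers one buffer and two data
 * chunks per split page, the caller must enlarge the registration arrays
 * before entering the critical section.  The counts below are illustrative
 * approximations, not the precise values the real GiST caller uses.
 */
static void
prepare_to_log_split(int npages)
{
	/* reserve room: the split pages plus the left child, and their data chunks */
	XLogEnsureRecordSpace(npages + 1, 1 + npages * 2);

	START_CRIT_SECTION();
	/* ... apply the split to the pages and call gistXLogSplit() here ... */
	END_CRIT_SECTION();
}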
/* * Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL. */ void log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum) { xl_smgr_create xlrec; /* * Make an XLOG entry reporting the file creation. */ xlrec.rnode = *rnode; xlrec.forkNum = forkNum; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, sizeof(xlrec)); XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE); }
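/*
 * Usage sketch: log_smgrcreate() is normally paired with the physical
 * smgrcreate() call, roughly as RelationCreateStorage() does it.  The check
 * that the relation is actually WAL-logged (permanent) is omitted here, and
 * the helper name is hypothetical.
 */
static void
create_fork_with_wal(SMgrRelation srel, ForkNumber forknum)
{
	smgrcreate(srel, forknum, false);
	log_smgrcreate(&srel->smgr_rnode.node, forknum);
}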
/* * Place a tuple on the page and fill in the WAL record * * If the tuple doesn't fit, returns SPLIT without modifying the page. * * On insertion to an internal node, in addition to inserting the given item, * the downlink of the existing item at 'off' is updated to point to * 'updateblkno'. * * On INSERTED, registers the buffer as buffer ID 0, with data. * On SPLIT, returns the new left and right page images in *newlpage and * *newrpage. */ static GinPlaceToPageRC entryPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, void *insertPayload, BlockNumber updateblkno, Page *newlpage, Page *newrpage) { GinBtreeEntryInsertData *insertData = insertPayload; Page page = BufferGetPage(buf, NULL, NULL, BGP_NO_SNAPSHOT_TEST); OffsetNumber off = stack->off; OffsetNumber placed; /* must be static: the WAL registration keeps only a pointer, and * XLogInsert() is performed by the caller */ static ginxlogInsertEntry data; /* quick exit if it doesn't fit */ if (!entryIsEnoughSpace(btree, buf, off, insertData)) { entrySplitPage(btree, buf, stack, insertPayload, updateblkno, newlpage, newrpage); return SPLIT; } START_CRIT_SECTION(); entryPreparePage(btree, page, off, insertData, updateblkno); placed = PageAddItem(page, (Item) insertData->entry, IndexTupleSize(insertData->entry), off, false, false); if (placed != off) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(btree->index)); if (RelationNeedsWAL(btree->index)) { data.isDelete = insertData->isDelete; data.offset = off; XLogBeginInsert(); XLogRegisterBuffer(0, buf, REGBUF_STANDARD); XLogRegisterBufData(0, (char *) &data, offsetof(ginxlogInsertEntry, tuple)); XLogRegisterBufData(0, (char *) insertData->entry, IndexTupleSize(insertData->entry)); } return INSERTED; }
/* * log_split_page() -- Log the split operation * * We log the split operation when the new page in new bucket gets full, * so we log the entire page. * * 'buf' must be locked by the caller which is also responsible for unlocking * it. */ static void log_split_page(Relation rel, Buffer buf) { if (RelationNeedsWAL(rel)) { XLogRecPtr recptr; XLogBeginInsert(); XLogRegisterBuffer(0, buf, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_PAGE); PageSetLSN(BufferGetPage(buf), recptr); } }
/* * Write a SETTS xlog record */ static void WriteSetTimestampXlogRec(TransactionId mainxid, int nsubxids, TransactionId *subxids, TimestampTz timestamp, RepOriginId nodeid) { xl_commit_ts_set record; record.timestamp = timestamp; record.nodeid = nodeid; record.mainxid = mainxid; XLogBeginInsert(); XLogRegisterData((char *) &record, offsetof(xl_commit_ts_set, mainxid) + sizeof(TransactionId)); XLogRegisterData((char *) subxids, nsubxids * sizeof(TransactionId)); XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_SETTS); }
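/*
 * The offsetof() arithmetic above registers the record only up to and
 * including mainxid, so it relies on mainxid being the last fixed field of
 * the struct, with the subxid array following it directly in the WAL
 * payload.  A hedged sketch of the layout implied by that registration (the
 * real header may differ in naming or padding details):
 */
typedef struct xl_commit_ts_set_sketch
{
	TimestampTz timestamp;		/* commit timestamp being recorded */
	RepOriginId nodeid;			/* replication origin */
	TransactionId mainxid;		/* last fixed field */
	/* nsubxids TransactionIds follow, registered as a second data chunk */
} xl_commit_ts_set_sketch;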
/* * Write the given statistics to the index's metapage * * Note: nPendingPages and ginVersion are *not* copied over */ void ginUpdateStats(Relation index, const GinStatsData *stats) { Buffer metabuffer; Page metapage; GinMetaPageData *metadata; metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO); LockBuffer(metabuffer, GIN_EXCLUSIVE); metapage = BufferGetPage(metabuffer); metadata = GinPageGetMeta(metapage); START_CRIT_SECTION(); metadata->nTotalPages = stats->nTotalPages; metadata->nEntryPages = stats->nEntryPages; metadata->nDataPages = stats->nDataPages; metadata->nEntries = stats->nEntries; MarkBufferDirty(metabuffer); if (RelationNeedsWAL(index)) { XLogRecPtr recptr; ginxlogUpdateMeta data; data.node = index->rd_node; data.ntuples = 0; data.newRightlink = data.prevTail = InvalidBlockNumber; memcpy(&data.metadata, metadata, sizeof(GinMetaPageData)); XLogBeginInsert(); XLogRegisterData((char *) &data, sizeof(ginxlogUpdateMeta)); XLogRegisterBuffer(0, metabuffer, REGBUF_WILL_INIT); recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_UPDATE_META_PAGE); PageSetLSN(metapage, recptr); } UnlockReleaseBuffer(metabuffer); END_CRIT_SECTION(); }
/* * Create a WAL record for vacuuming entry tree leaf page. */ static void xlogVacuumPage(Relation index, Buffer buffer) { Page page = BufferGetPage(buffer); XLogRecPtr recptr; /* This is only used for entry tree leaf pages. */ Assert(!GinPageIsData(page)); Assert(GinPageIsLeaf(page)); if (!RelationNeedsWAL(index)) return; /* * Always create a full image, we don't track the changes on the page at * any more fine-grained level. This could obviously be improved... */ XLogBeginInsert(); XLogRegisterBuffer(0, buffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_PAGE); PageSetLSN(page, recptr); }
/* * Create a table space * * Only superusers can create a tablespace. This seems a reasonable restriction * since we're determining the system layout and, anyway, we probably have * root if we're doing this kind of activity */ Oid CreateTableSpace(CreateTableSpaceStmt *stmt) { #ifdef HAVE_SYMLINK Relation rel; Datum values[Natts_pg_tablespace]; bool nulls[Natts_pg_tablespace]; HeapTuple tuple; Oid tablespaceoid; char *location; Oid ownerId; Datum newOptions; /* Must be super user */ if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("permission denied to create tablespace \"%s\"", stmt->tablespacename), errhint("Must be superuser to create a tablespace."))); /* However, the eventual owner of the tablespace need not be */ if (stmt->owner) ownerId = get_rolespec_oid(stmt->owner, false); else ownerId = GetUserId(); /* Unix-ify the offered path, and strip any trailing slashes */ location = pstrdup(stmt->location); canonicalize_path(location); /* disallow quotes, else CREATE DATABASE would be at risk */ if (strchr(location, '\'')) ereport(ERROR, (errcode(ERRCODE_INVALID_NAME), errmsg("tablespace location cannot contain single quotes"))); /* * Allowing relative paths seems risky * * this also helps us ensure that location is not empty or whitespace */ if (!is_absolute_path(location)) ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("tablespace location must be an absolute path"))); /* * Check that location isn't too long. Remember that we're going to append * 'PG_XXX/<dboid>/<relid>_<fork>.<nnn>'. FYI, we never actually * reference the whole path here, but mkdir() uses the first two parts. */ if (strlen(location) + 1 + strlen(TABLESPACE_VERSION_DIRECTORY) + 1 + OIDCHARS + 1 + OIDCHARS + 1 + FORKNAMECHARS + 1 + OIDCHARS > MAXPGPATH) ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("tablespace location \"%s\" is too long", location))); /* Warn if the tablespace is in the data directory. */ if (path_is_prefix_of_path(DataDir, location)) ereport(WARNING, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("tablespace location should not be inside the data directory"))); /* * Disallow creation of tablespaces named "pg_xxx"; we reserve this * namespace for system purposes. */ if (!allowSystemTableMods && IsReservedName(stmt->tablespacename)) ereport(ERROR, (errcode(ERRCODE_RESERVED_NAME), errmsg("unacceptable tablespace name \"%s\"", stmt->tablespacename), errdetail("The prefix \"pg_\" is reserved for system tablespaces."))); /* * Check that there is no other tablespace by this name. (The unique * index would catch this anyway, but might as well give a friendlier * message.) */ if (OidIsValid(get_tablespace_oid(stmt->tablespacename, true))) ereport(ERROR, (errcode(ERRCODE_DUPLICATE_OBJECT), errmsg("tablespace \"%s\" already exists", stmt->tablespacename))); /* * Insert tuple into pg_tablespace. The purpose of doing this first is to * lock the proposed tablename against other would-be creators. The * insertion will roll back if we find problems below. 
*/ rel = heap_open(TableSpaceRelationId, RowExclusiveLock); MemSet(nulls, false, sizeof(nulls)); values[Anum_pg_tablespace_spcname - 1] = DirectFunctionCall1(namein, CStringGetDatum(stmt->tablespacename)); values[Anum_pg_tablespace_spcowner - 1] = ObjectIdGetDatum(ownerId); nulls[Anum_pg_tablespace_spcacl - 1] = true; /* Generate new proposed spcoptions (text array) */ newOptions = transformRelOptions((Datum) 0, stmt->options, NULL, NULL, false, false); (void) tablespace_reloptions(newOptions, true); if (newOptions != (Datum) 0) values[Anum_pg_tablespace_spcoptions - 1] = newOptions; else nulls[Anum_pg_tablespace_spcoptions - 1] = true; tuple = heap_form_tuple(rel->rd_att, values, nulls); tablespaceoid = simple_heap_insert(rel, tuple); CatalogUpdateIndexes(rel, tuple); heap_freetuple(tuple); /* Record dependency on owner */ recordDependencyOnOwner(TableSpaceRelationId, tablespaceoid, ownerId); /* Post creation hook for new tablespace */ InvokeObjectPostCreateHook(TableSpaceRelationId, tablespaceoid, 0); create_tablespace_directories(location, tablespaceoid); /* Record the filesystem change in XLOG */ { xl_tblspc_create_rec xlrec; xlrec.ts_id = tablespaceoid; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, offsetof(xl_tblspc_create_rec, ts_path)); XLogRegisterData((char *) location, strlen(location) + 1); (void) XLogInsert(RM_TBLSPC_ID, XLOG_TBLSPC_CREATE); } /* * Force synchronous commit, to minimize the window between creating the * symlink on-disk and marking the transaction committed. It's not great * that there is any window at all, but definitely we don't want to make * it larger than necessary. */ ForceSyncCommit(); pfree(location); /* We keep the lock on pg_tablespace until commit */ heap_close(rel, NoLock); return tablespaceoid; #else /* !HAVE_SYMLINK */ ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("tablespaces are not supported on this platform"))); return InvalidOid; /* keep compiler quiet */ #endif /* HAVE_SYMLINK */ }
/* * Write out a new shared or local map file with the given contents. * * The magic number and CRC are automatically updated in *newmap. On * success, we copy the data to the appropriate permanent static variable. * * If write_wal is TRUE then an appropriate WAL message is emitted. * (It will be false for bootstrap and WAL replay cases.) * * If send_sinval is TRUE then a SI invalidation message is sent. * (This should be true except in bootstrap case.) * * If preserve_files is TRUE then the storage manager is warned not to * delete the files listed in the map. * * Because this may be called during WAL replay when MyDatabaseId, * DatabasePath, etc aren't valid, we require the caller to pass in suitable * values. The caller is also responsible for being sure no concurrent * map update could be happening. */ static void write_relmap_file(bool shared, RelMapFile *newmap, bool write_wal, bool send_sinval, bool preserve_files, Oid dbid, Oid tsid, const char *dbpath) { int fd; RelMapFile *realmap; char mapfilename[MAXPGPATH]; /* * Fill in the overhead fields and update CRC. */ newmap->magic = RELMAPPER_FILEMAGIC; if (newmap->num_mappings < 0 || newmap->num_mappings > MAX_MAPPINGS) elog(ERROR, "attempt to write bogus relation mapping"); INIT_CRC32C(newmap->crc); COMP_CRC32C(newmap->crc, (char *) newmap, offsetof(RelMapFile, crc)); FIN_CRC32C(newmap->crc); /* * Open the target file. We prefer to do this before entering the * critical section, so that an open() failure need not force PANIC. */ if (shared) { snprintf(mapfilename, sizeof(mapfilename), "global/%s", RELMAPPER_FILENAME); realmap = &shared_map; } else { snprintf(mapfilename, sizeof(mapfilename), "%s/%s", dbpath, RELMAPPER_FILENAME); realmap = &local_map; } fd = OpenTransientFile(mapfilename, O_WRONLY | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR); if (fd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open relation mapping file \"%s\": %m", mapfilename))); if (write_wal) { xl_relmap_update xlrec; XLogRecPtr lsn; /* now errors are fatal ... */ START_CRIT_SECTION(); xlrec.dbid = dbid; xlrec.tsid = tsid; xlrec.nbytes = sizeof(RelMapFile); XLogBeginInsert(); XLogRegisterData((char *) (&xlrec), MinSizeOfRelmapUpdate); XLogRegisterData((char *) newmap, sizeof(RelMapFile)); lsn = XLogInsert(RM_RELMAP_ID, XLOG_RELMAP_UPDATE); /* As always, WAL must hit the disk before the data update does */ XLogFlush(lsn); } errno = 0; if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile)) { /* if write didn't set errno, assume problem is no disk space */ if (errno == 0) errno = ENOSPC; ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to relation mapping file \"%s\": %m", mapfilename))); } /* * We choose to fsync the data to disk before considering the task done. * It would be possible to relax this if it turns out to be a performance * issue, but it would complicate checkpointing --- see notes for * CheckPointRelationMap. */ if (pg_fsync(fd) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not fsync relation mapping file \"%s\": %m", mapfilename))); if (CloseTransientFile(fd)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not close relation mapping file \"%s\": %m", mapfilename))); /* * Now that the file is safely on disk, send sinval message to let other * backends know to re-read it. We must do this inside the critical * section: if for some reason we fail to send the message, we have to * force a database-wide PANIC. 
Otherwise other backends might continue * execution with stale mapping information, which would be catastrophic * as soon as others began to use the now-committed data. */ if (send_sinval) CacheInvalidateRelmap(dbid); /* * Make sure that the files listed in the map are not deleted if the outer * transaction aborts. This had better be within the critical section * too: it's not likely to fail, but if it did, we'd arrive at transaction * abort with the files still vulnerable. PANICing will leave things in a * good state on-disk. * * Note: we're cheating a little bit here by assuming that mapped files * are either in pg_global or the database's default tablespace. */ if (preserve_files) { int32 i; for (i = 0; i < newmap->num_mappings; i++) { RelFileNode rnode; rnode.spcNode = tsid; rnode.dbNode = dbid; rnode.relNode = newmap->mappings[i].mapfilenode; RelationPreserveStorage(rnode, false); } } /* Success, update permanent copy */ memcpy(realmap, newmap, sizeof(RelMapFile)); /* Critical section done */ if (write_wal) END_CRIT_SECTION(); }
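/*
 * Sketch of the read-side counterpart to the CRC computation in
 * write_relmap_file(): recompute the CRC over everything preceding the crc
 * field and compare.  This helper is hypothetical; the real loader also
 * checks the magic number and the mapping count.
 */
static bool
relmap_crc_matches(RelMapFile *map)
{
	pg_crc32c	crc;

	INIT_CRC32C(crc);
	COMP_CRC32C(crc, (char *) map, offsetof(RelMapFile, crc));
	FIN_CRC32C(crc);

	return EQ_CRC32C(crc, map->crc);
}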
/* * Vacuum a regular (non-root) leaf page * * We must delete tuples that are targeted for deletion by the VACUUM, * but not move any tuples that are referenced by outside links; we assume * those are the ones that are heads of chains. * * If we find a REDIRECT that was made by a concurrently-running transaction, * we must add its target TID to pendingList. (We don't try to visit the * target immediately, first because we don't want VACUUM locking more than * one buffer at a time, and second because the duplicate-filtering logic * in spgAddPendingTID is useful to ensure we can't get caught in an infinite * loop in the face of continuous concurrent insertions.) * * If forPending is true, we are examining the page as a consequence of * chasing a redirect link, not as part of the normal sequential scan. * We still vacuum the page normally, but we don't increment the stats * about live tuples; else we'd double-count those tuples, since the page * has been or will be visited in the sequential scan as well. */ static void vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer, bool forPending) { Page page = BufferGetPage(buffer); spgxlogVacuumLeaf xlrec; OffsetNumber toDead[MaxIndexTuplesPerPage]; OffsetNumber toPlaceholder[MaxIndexTuplesPerPage]; OffsetNumber moveSrc[MaxIndexTuplesPerPage]; OffsetNumber moveDest[MaxIndexTuplesPerPage]; OffsetNumber chainSrc[MaxIndexTuplesPerPage]; OffsetNumber chainDest[MaxIndexTuplesPerPage]; OffsetNumber predecessor[MaxIndexTuplesPerPage + 1]; bool deletable[MaxIndexTuplesPerPage + 1]; int nDeletable; OffsetNumber i, max = PageGetMaxOffsetNumber(page); memset(predecessor, 0, sizeof(predecessor)); memset(deletable, 0, sizeof(deletable)); nDeletable = 0; /* Scan page, identify tuples to delete, accumulate stats */ for (i = FirstOffsetNumber; i <= max; i++) { SpGistLeafTuple lt; lt = (SpGistLeafTuple) PageGetItem(page, PageGetItemId(page, i)); if (lt->tupstate == SPGIST_LIVE) { Assert(ItemPointerIsValid(&lt->heapPtr)); if (bds->callback(&lt->heapPtr, bds->callback_state)) { bds->stats->tuples_removed += 1; deletable[i] = true; nDeletable++; } else { if (!forPending) bds->stats->num_index_tuples += 1; } /* Form predecessor map, too */ if (lt->nextOffset != InvalidOffsetNumber) { /* paranoia about corrupted chain links */ if (lt->nextOffset < FirstOffsetNumber || lt->nextOffset > max || predecessor[lt->nextOffset] != InvalidOffsetNumber) elog(ERROR, "inconsistent tuple chain links in page %u of index \"%s\"", BufferGetBlockNumber(buffer), RelationGetRelationName(index)); predecessor[lt->nextOffset] = i; } } else if (lt->tupstate == SPGIST_REDIRECT) { SpGistDeadTuple dt = (SpGistDeadTuple) lt; Assert(dt->nextOffset == InvalidOffsetNumber); Assert(ItemPointerIsValid(&dt->pointer)); /* * Add target TID to pending list if the redirection could have * happened since VACUUM started. * * Note: we could make a tighter test by seeing if the xid is * "running" according to the active snapshot; but snapmgr.c doesn't * currently export a suitable API, and it's not entirely clear * that a tighter test is worth the cycles anyway. */ if (TransactionIdFollowsOrEquals(dt->xid, bds->myXmin)) spgAddPendingTID(bds, &dt->pointer); } else { Assert(lt->nextOffset == InvalidOffsetNumber); } } if (nDeletable == 0) return; /* nothing more to do */ /*---------- * Figure out exactly what we have to do.
We do this separately from * actually modifying the page, mainly so that we have a representation * that can be dumped into WAL and then the replay code can do exactly * the same thing. The output of this step consists of six arrays * describing four kinds of operations, to be performed in this order: * * toDead[]: tuple numbers to be replaced with DEAD tuples * toPlaceholder[]: tuple numbers to be replaced with PLACEHOLDER tuples * moveSrc[]: tuple numbers that need to be relocated to another offset * (replacing the tuple there) and then replaced with PLACEHOLDER tuples * moveDest[]: new locations for moveSrc tuples * chainSrc[]: tuple numbers whose chain links (nextOffset) need updates * chainDest[]: new values of nextOffset for chainSrc members * * It's easiest to figure out what we have to do by processing tuple * chains, so we iterate over all the tuples (not just the deletable * ones!) to identify chain heads, then chase down each chain and make * work item entries for deletable tuples within the chain. *---------- */ xlrec.nDead = xlrec.nPlaceholder = xlrec.nMove = xlrec.nChain = 0; for (i = FirstOffsetNumber; i <= max; i++) { SpGistLeafTuple head; bool interveningDeletable; OffsetNumber prevLive; OffsetNumber j; head = (SpGistLeafTuple) PageGetItem(page, PageGetItemId(page, i)); if (head->tupstate != SPGIST_LIVE) continue; /* can't be a chain member */ if (predecessor[i] != 0) continue; /* not a chain head */ /* initialize ... */ interveningDeletable = false; prevLive = deletable[i] ? InvalidOffsetNumber : i; /* scan down the chain ... */ j = head->nextOffset; while (j != InvalidOffsetNumber) { SpGistLeafTuple lt; lt = (SpGistLeafTuple) PageGetItem(page, PageGetItemId(page, j)); if (lt->tupstate != SPGIST_LIVE) { /* all tuples in chain should be live */ elog(ERROR, "unexpected SPGiST tuple state: %d", lt->tupstate); } if (deletable[j]) { /* This tuple should be replaced by a placeholder */ toPlaceholder[xlrec.nPlaceholder] = j; xlrec.nPlaceholder++; /* previous live tuple's chain link will need an update */ interveningDeletable = true; } else if (prevLive == InvalidOffsetNumber) { /* * This is the first live tuple in the chain. It has to move * to the head position. */ moveSrc[xlrec.nMove] = j; moveDest[xlrec.nMove] = i; xlrec.nMove++; /* Chain updates will be applied after the move */ prevLive = i; interveningDeletable = false; } else { /* * Second or later live tuple. Arrange to re-chain it to the * previous live one, if there was a gap. */ if (interveningDeletable) { chainSrc[xlrec.nChain] = prevLive; chainDest[xlrec.nChain] = j; xlrec.nChain++; } prevLive = j; interveningDeletable = false; } j = lt->nextOffset; } if (prevLive == InvalidOffsetNumber) { /* The chain is entirely removable, so we need a DEAD tuple */ toDead[xlrec.nDead] = i; xlrec.nDead++; } else if (interveningDeletable) { /* One or more deletions at end of chain, so close it off */ chainSrc[xlrec.nChain] = prevLive; chainDest[xlrec.nChain] = InvalidOffsetNumber; xlrec.nChain++; } } /* sanity check ... 
*/ if (nDeletable != xlrec.nDead + xlrec.nPlaceholder + xlrec.nMove) elog(ERROR, "inconsistent counts of deletable tuples"); /* Do the updates */ START_CRIT_SECTION(); spgPageIndexMultiDelete(&bds->spgstate, page, toDead, xlrec.nDead, SPGIST_DEAD, SPGIST_DEAD, InvalidBlockNumber, InvalidOffsetNumber); spgPageIndexMultiDelete(&bds->spgstate, page, toPlaceholder, xlrec.nPlaceholder, SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER, InvalidBlockNumber, InvalidOffsetNumber); /* * We implement the move step by swapping the line pointers of the source * and target tuples, then replacing the newly-source tuples with * placeholders. This is perhaps unduly friendly with the page data * representation, but it's fast and doesn't risk page overflow when a * tuple to be relocated is large. */ for (i = 0; i < xlrec.nMove; i++) { ItemId idSrc = PageGetItemId(page, moveSrc[i]); ItemId idDest = PageGetItemId(page, moveDest[i]); ItemIdData tmp; tmp = *idSrc; *idSrc = *idDest; *idDest = tmp; } spgPageIndexMultiDelete(&bds->spgstate, page, moveSrc, xlrec.nMove, SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER, InvalidBlockNumber, InvalidOffsetNumber); for (i = 0; i < xlrec.nChain; i++) { SpGistLeafTuple lt; lt = (SpGistLeafTuple) PageGetItem(page, PageGetItemId(page, chainSrc[i])); Assert(lt->tupstate == SPGIST_LIVE); lt->nextOffset = chainDest[i]; } MarkBufferDirty(buffer); if (RelationNeedsWAL(index)) { XLogRecPtr recptr; XLogBeginInsert(); STORE_STATE(&bds->spgstate, xlrec.stateSrc); XLogRegisterData((char *) &xlrec, SizeOfSpgxlogVacuumLeaf); /* sizeof(xlrec) should be a multiple of sizeof(OffsetNumber) */ XLogRegisterData((char *) toDead, sizeof(OffsetNumber) * xlrec.nDead); XLogRegisterData((char *) toPlaceholder, sizeof(OffsetNumber) * xlrec.nPlaceholder); XLogRegisterData((char *) moveSrc, sizeof(OffsetNumber) * xlrec.nMove); XLogRegisterData((char *) moveDest, sizeof(OffsetNumber) * xlrec.nMove); XLogRegisterData((char *) chainSrc, sizeof(OffsetNumber) * xlrec.nChain); XLogRegisterData((char *) chainDest, sizeof(OffsetNumber) * xlrec.nChain); XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_LEAF); PageSetLSN(page, recptr); } END_CRIT_SECTION(); }
/* * Vacuum a root page when it is also a leaf * * On the root, we just delete any dead leaf tuples; no fancy business */ static void vacuumLeafRoot(spgBulkDeleteState *bds, Relation index, Buffer buffer) { Page page = BufferGetPage(buffer); spgxlogVacuumRoot xlrec; OffsetNumber toDelete[MaxIndexTuplesPerPage]; OffsetNumber i, max = PageGetMaxOffsetNumber(page); xlrec.nDelete = 0; /* Scan page, identify tuples to delete, accumulate stats */ for (i = FirstOffsetNumber; i <= max; i++) { SpGistLeafTuple lt; lt = (SpGistLeafTuple) PageGetItem(page, PageGetItemId(page, i)); if (lt->tupstate == SPGIST_LIVE) { Assert(ItemPointerIsValid(&lt->heapPtr)); if (bds->callback(&lt->heapPtr, bds->callback_state)) { bds->stats->tuples_removed += 1; toDelete[xlrec.nDelete] = i; xlrec.nDelete++; } else { bds->stats->num_index_tuples += 1; } } else { /* all tuples on root should be live */ elog(ERROR, "unexpected SPGiST tuple state: %d", lt->tupstate); } } if (xlrec.nDelete == 0) return; /* nothing more to do */ /* Do the update */ START_CRIT_SECTION(); /* The tuple numbers are in order, so we can use PageIndexMultiDelete */ PageIndexMultiDelete(page, toDelete, xlrec.nDelete); MarkBufferDirty(buffer); if (RelationNeedsWAL(index)) { XLogRecPtr recptr; XLogBeginInsert(); /* Prepare WAL record */ STORE_STATE(&bds->spgstate, xlrec.stateSrc); XLogRegisterData((char *) &xlrec, SizeOfSpgxlogVacuumRoot); /* sizeof(xlrec) should be a multiple of sizeof(OffsetNumber) */ XLogRegisterData((char *) toDelete, sizeof(OffsetNumber) * xlrec.nDelete); XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_ROOT); PageSetLSN(page, recptr); } END_CRIT_SECTION(); }
/* * Helper function to perform deletion of index entries from a bucket. * * This function expects that the caller has acquired a cleanup lock on the * primary bucket page, and will return with a write lock again held on the * primary bucket page. The lock won't necessarily be held continuously, * though, because we'll release it when visiting overflow pages. * * It would be very bad if this function cleaned a page while some other * backend was in the midst of scanning it, because hashgettuple assumes * that the next valid TID will be greater than or equal to the current * valid TID. There can't be any concurrent scans in progress when we first * enter this function because of the cleanup lock we hold on the primary * bucket page, but as soon as we release that lock, there might be. We * handle that by conspiring to prevent those scans from passing our cleanup * scan. To do that, we lock the next page in the bucket chain before * releasing the lock on the previous page. (This type of lock chaining is * not ideal, so we might want to look for a better solution at some point.) * * We need to retain a pin on the primary bucket to ensure that no concurrent * split can start. */ void hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, BlockNumber bucket_blkno, BufferAccessStrategy bstrategy, uint32 maxbucket, uint32 highmask, uint32 lowmask, double *tuples_removed, double *num_index_tuples, bool split_cleanup, IndexBulkDeleteCallback callback, void *callback_state) { BlockNumber blkno; Buffer buf; Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket; bool bucket_dirty = false; blkno = bucket_blkno; buf = bucket_buf; if (split_cleanup) new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket, lowmask, maxbucket); /* Scan each page in bucket */ for (;;) { HashPageOpaque opaque; OffsetNumber offno; OffsetNumber maxoffno; Buffer next_buf; Page page; OffsetNumber deletable[MaxOffsetNumber]; int ndeletable = 0; bool retain_pin = false; bool clear_dead_marking = false; vacuum_delay_point(); page = BufferGetPage(buf); opaque = (HashPageOpaque) PageGetSpecialPointer(page); /* Scan each tuple in page */ maxoffno = PageGetMaxOffsetNumber(page); for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) { ItemPointer htup; IndexTuple itup; Bucket bucket; bool kill_tuple = false; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offno)); htup = &(itup->t_tid); /* * To remove the dead tuples, we strictly want to rely on results * of callback function. refer btvacuumpage for detailed reason. */ if (callback && callback(htup, callback_state)) { kill_tuple = true; if (tuples_removed) *tuples_removed += 1; } else if (split_cleanup) { /* delete the tuples that are moved by split. */ bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), maxbucket, highmask, lowmask); /* mark the item for deletion */ if (bucket != cur_bucket) { /* * We expect tuples to either belong to current bucket or * new_bucket. This is ensured because we don't allow * further splits from bucket that contains garbage. See * comments in _hash_expandtable. 
*/ Assert(bucket == new_bucket); kill_tuple = true; } } if (kill_tuple) { /* mark the item for deletion */ deletable[ndeletable++] = offno; } else { /* we're keeping it, so count it */ if (num_index_tuples) *num_index_tuples += 1; } } /* retain the pin on primary bucket page till end of bucket scan */ if (blkno == bucket_blkno) retain_pin = true; else retain_pin = false; blkno = opaque->hasho_nextblkno; /* * Apply deletions, advance to next page and write page if needed. */ if (ndeletable > 0) { /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); PageIndexMultiDelete(page, deletable, ndeletable); bucket_dirty = true; /* * Let us mark the page as clean if vacuum removes the DEAD tuples * from an index page. We do this by clearing * LH_PAGE_HAS_DEAD_TUPLES flag. */ if (tuples_removed && *tuples_removed > 0 && H_HAS_DEAD_TUPLES(opaque)) { opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; clear_dead_marking = true; } MarkBufferDirty(buf); /* XLOG stuff */ if (RelationNeedsWAL(rel)) { xl_hash_delete xlrec; XLogRecPtr recptr; xlrec.clear_dead_marking = clear_dead_marking; xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashDelete); /* * bucket buffer needs to be registered to ensure that we can * acquire a cleanup lock on it during replay. */ if (!xlrec.is_primary_bucket_page) XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE); XLogRegisterBuffer(1, buf, REGBUF_STANDARD); XLogRegisterBufData(1, (char *) deletable, ndeletable * sizeof(OffsetNumber)); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE); PageSetLSN(BufferGetPage(buf), recptr); } END_CRIT_SECTION(); } /* bail out if there are no more pages to scan. */ if (!BlockNumberIsValid(blkno)) break; next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); /* * release the lock on previous page after acquiring the lock on next * page */ if (retain_pin) LockBuffer(buf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, buf); buf = next_buf; } /* * lock the bucket page to clear the garbage flag and squeeze the bucket. * if the current buffer is same as bucket buffer, then we already have * lock on bucket page. */ if (buf != bucket_buf) { _hash_relbuf(rel, buf); LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE); } /* * Clear the garbage flag from bucket after deleting the tuples that are * moved by split. We purposefully clear the flag before squeeze bucket, * so that after restart, vacuum shouldn't again try to delete the moved * by split tuples. */ if (split_cleanup) { HashPageOpaque bucket_opaque; Page page; page = BufferGetPage(bucket_buf); bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP; MarkBufferDirty(bucket_buf); /* XLOG stuff */ if (RelationNeedsWAL(rel)) { XLogRecPtr recptr; XLogBeginInsert(); XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP); PageSetLSN(page, recptr); } END_CRIT_SECTION(); } /* * If we have deleted anything, try to compact free space. For squeezing * the bucket, we must have a cleanup lock, else it can impact the * ordering of tuples for a scan that has started before it. */ if (bucket_dirty && IsBufferCleanupOK(bucket_buf)) _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf, bstrategy); else LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK); }
/* * Delete a posting tree page. */ static void ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkno, BlockNumber parentBlkno, OffsetNumber myoff, bool isParentRoot) { Buffer dBuffer; Buffer lBuffer; Buffer pBuffer; Page page, parentPage; BlockNumber rightlink; /* * Lock the pages in the same order as an insertion would, to avoid * deadlocks: left, then right, then parent. */ lBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, leftBlkno, RBM_NORMAL, gvs->strategy); dBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, deleteBlkno, RBM_NORMAL, gvs->strategy); pBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, parentBlkno, RBM_NORMAL, gvs->strategy); LockBuffer(lBuffer, GIN_EXCLUSIVE); LockBuffer(dBuffer, GIN_EXCLUSIVE); if (!isParentRoot) /* parent is already locked by * LockBufferForCleanup() */ LockBuffer(pBuffer, GIN_EXCLUSIVE); START_CRIT_SECTION(); /* Unlink the page by changing left sibling's rightlink */ page = BufferGetPage(dBuffer); rightlink = GinPageGetOpaque(page)->rightlink; page = BufferGetPage(lBuffer); GinPageGetOpaque(page)->rightlink = rightlink; /* Delete downlink from parent */ parentPage = BufferGetPage(pBuffer); #ifdef USE_ASSERT_CHECKING do { PostingItem *tod = GinDataPageGetPostingItem(parentPage, myoff); Assert(PostingItemGetBlockNumber(tod) == deleteBlkno); } while (0); #endif GinPageDeletePostingItem(parentPage, myoff); page = BufferGetPage(dBuffer); /* * Leave the deleted page's rightlink unchanged, so that any concurrent * search scan already on this page can still follow it to the right * sibling. */ GinPageGetOpaque(page)->flags = GIN_DELETED; MarkBufferDirty(pBuffer); MarkBufferDirty(lBuffer); MarkBufferDirty(dBuffer); if (RelationNeedsWAL(gvs->index)) { XLogRecPtr recptr; ginxlogDeletePage data; /* * We can't pass REGBUF_STANDARD for the deleted page, because we * didn't set pd_lower on pre-9.4 versions. The page might've been * binary-upgraded from an older version, and hence not have pd_lower * set correctly. Ditto for the left page, but removing the item from * the parent updated its pd_lower, so we know that's OK at this * point. */ XLogBeginInsert(); XLogRegisterBuffer(0, dBuffer, 0); XLogRegisterBuffer(1, pBuffer, REGBUF_STANDARD); XLogRegisterBuffer(2, lBuffer, 0); data.parentOffset = myoff; data.rightLink = GinPageGetOpaque(page)->rightlink; XLogRegisterData((char *) &data, sizeof(ginxlogDeletePage)); recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_PAGE); PageSetLSN(page, recptr); PageSetLSN(parentPage, recptr); PageSetLSN(BufferGetPage(lBuffer), recptr); } if (!isParentRoot) LockBuffer(pBuffer, GIN_UNLOCK); ReleaseBuffer(pBuffer); UnlockReleaseBuffer(lBuffer); UnlockReleaseBuffer(dBuffer); END_CRIT_SECTION(); gvs->result->pages_deleted++; }
/* * Bulk deletion of all index entries pointing to a set of heap tuples. * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. * * This function also deletes the tuples that were moved by a split to * another bucket. * * Result: a palloc'd struct containing statistical info for VACUUM displays. */ IndexBulkDeleteResult * hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state) { Relation rel = info->index; double tuples_removed; double num_index_tuples; double orig_ntuples; Bucket orig_maxbucket; Bucket cur_maxbucket; Bucket cur_bucket; Buffer metabuf = InvalidBuffer; HashMetaPage metap; HashMetaPage cachedmetap; tuples_removed = 0; num_index_tuples = 0; /* * We need a copy of the metapage so that we can use its hashm_spares[] * values to compute bucket page addresses, but a cached copy should be * good enough. (If not, we'll detect that further down and refresh the * cache as necessary.) */ cachedmetap = _hash_getcachedmetap(rel, &metabuf, false); Assert(cachedmetap != NULL); orig_maxbucket = cachedmetap->hashm_maxbucket; orig_ntuples = cachedmetap->hashm_ntuples; /* Scan the buckets that we know exist */ cur_bucket = 0; cur_maxbucket = orig_maxbucket; loop_top: while (cur_bucket <= cur_maxbucket) { BlockNumber bucket_blkno; BlockNumber blkno; Buffer bucket_buf; Buffer buf; HashPageOpaque bucket_opaque; Page page; bool split_cleanup = false; /* Get address of bucket's start page */ bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket); blkno = bucket_blkno; /* * We need to acquire a cleanup lock on the primary bucket page to wait * out concurrent scans before deleting the dead tuples. */ buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); LockBufferForCleanup(buf); _hash_checkpage(rel, buf, LH_BUCKET_PAGE); page = BufferGetPage(buf); bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); /* * If the bucket contains tuples that are moved by split, then we need * to delete such tuples. We can't delete such tuples if the split * operation on the bucket is not finished, as those are needed by * scans. */ if (!H_BUCKET_BEING_SPLIT(bucket_opaque) && H_NEEDS_SPLIT_CLEANUP(bucket_opaque)) { split_cleanup = true; /* * This bucket might have been split since we last held a lock on * the metapage. If so, hashm_maxbucket, hashm_highmask and * hashm_lowmask might be old enough to cause us to fail to remove * tuples left behind by the most recent split. To prevent that, * now that the primary page of the target bucket has been locked * (and thus can't be further split), check whether we need to * update our cached metapage data.
*/ Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber); if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket) { cachedmetap = _hash_getcachedmetap(rel, &metabuf, true); Assert(cachedmetap != NULL); } } bucket_buf = buf; hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy, cachedmetap->hashm_maxbucket, cachedmetap->hashm_highmask, cachedmetap->hashm_lowmask, &tuples_removed, &num_index_tuples, split_cleanup, callback, callback_state); _hash_dropbuf(rel, bucket_buf); /* Advance to next bucket */ cur_bucket++; } if (BufferIsInvalid(metabuf)) metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE); /* Write-lock metapage and check for split since we started */ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); metap = HashPageGetMeta(BufferGetPage(metabuf)); if (cur_maxbucket != metap->hashm_maxbucket) { /* There's been a split, so process the additional bucket(s) */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); cachedmetap = _hash_getcachedmetap(rel, &metabuf, true); Assert(cachedmetap != NULL); cur_maxbucket = cachedmetap->hashm_maxbucket; goto loop_top; } /* Okay, we're really done. Update tuple count in metapage. */ START_CRIT_SECTION(); if (orig_maxbucket == metap->hashm_maxbucket && orig_ntuples == metap->hashm_ntuples) { /* * No one has split or inserted anything since start of scan, so * believe our count as gospel. */ metap->hashm_ntuples = num_index_tuples; } else { /* * Otherwise, our count is untrustworthy since we may have * double-scanned tuples in split buckets. Proceed by dead-reckoning. * (Note: we still return estimated_count = false, because using this * count is better than not updating reltuples at all.) */ if (metap->hashm_ntuples > tuples_removed) metap->hashm_ntuples -= tuples_removed; else metap->hashm_ntuples = 0; num_index_tuples = metap->hashm_ntuples; } MarkBufferDirty(metabuf); /* XLOG stuff */ if (RelationNeedsWAL(rel)) { xl_hash_update_meta_page xlrec; XLogRecPtr recptr; xlrec.ntuples = metap->hashm_ntuples; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage); XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE); PageSetLSN(BufferGetPage(metabuf), recptr); } END_CRIT_SECTION(); _hash_relbuf(rel, metabuf); /* return statistics */ if (stats == NULL) stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); stats->estimated_count = false; stats->num_index_tuples = num_index_tuples; stats->tuples_removed += tuples_removed; /* hashvacuumcleanup will fill in num_pages */ return stats; }
/* * Clean up redirect and placeholder tuples on the given page * * Redirect tuples can be marked placeholder once they're old enough. * Placeholder tuples can be removed if it won't change the offsets of * non-placeholder ones. * * Unlike the routines above, this works on both leaf and inner pages. */ static void vacuumRedirectAndPlaceholder(Relation index, Buffer buffer) { Page page = BufferGetPage(buffer); SpGistPageOpaque opaque = SpGistPageGetOpaque(page); OffsetNumber i, max = PageGetMaxOffsetNumber(page), firstPlaceholder = InvalidOffsetNumber; bool hasNonPlaceholder = false; bool hasUpdate = false; OffsetNumber itemToPlaceholder[MaxIndexTuplesPerPage]; OffsetNumber itemnos[MaxIndexTuplesPerPage]; spgxlogVacuumRedirect xlrec; xlrec.nToPlaceholder = 0; xlrec.newestRedirectXid = InvalidTransactionId; START_CRIT_SECTION(); /* * Scan backwards to convert old redirection tuples to placeholder tuples, * and identify location of last non-placeholder tuple while at it. */ for (i = max; i >= FirstOffsetNumber && (opaque->nRedirection > 0 || !hasNonPlaceholder); i--) { SpGistDeadTuple dt; dt = (SpGistDeadTuple) PageGetItem(page, PageGetItemId(page, i)); if (dt->tupstate == SPGIST_REDIRECT && TransactionIdPrecedes(dt->xid, RecentGlobalXmin)) { dt->tupstate = SPGIST_PLACEHOLDER; Assert(opaque->nRedirection > 0); opaque->nRedirection--; opaque->nPlaceholder++; /* remember newest XID among the removed redirects */ if (!TransactionIdIsValid(xlrec.newestRedirectXid) || TransactionIdPrecedes(xlrec.newestRedirectXid, dt->xid)) xlrec.newestRedirectXid = dt->xid; ItemPointerSetInvalid(&dt->pointer); itemToPlaceholder[xlrec.nToPlaceholder] = i; xlrec.nToPlaceholder++; hasUpdate = true; } if (dt->tupstate == SPGIST_PLACEHOLDER) { if (!hasNonPlaceholder) firstPlaceholder = i; } else { hasNonPlaceholder = true; } } /* * Any placeholder tuples at the end of page can safely be removed. We * can't remove ones before the last non-placeholder, though, because we * can't alter the offset numbers of non-placeholder tuples. */ if (firstPlaceholder != InvalidOffsetNumber) { /* * We do not store this array to rdata because it's easy to recreate. */ for (i = firstPlaceholder; i <= max; i++) itemnos[i - firstPlaceholder] = i; i = max - firstPlaceholder + 1; Assert(opaque->nPlaceholder >= i); opaque->nPlaceholder -= i; /* The array is surely sorted, so can use PageIndexMultiDelete */ PageIndexMultiDelete(page, itemnos, i); hasUpdate = true; } xlrec.firstPlaceholder = firstPlaceholder; if (hasUpdate) MarkBufferDirty(buffer); if (hasUpdate && RelationNeedsWAL(index)) { XLogRecPtr recptr; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfSpgxlogVacuumRedirect); XLogRegisterData((char *) itemToPlaceholder, sizeof(OffsetNumber) * xlrec.nToPlaceholder); XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_REDIRECT); PageSetLSN(page, recptr); } END_CRIT_SECTION(); }
/* * Write the index tuples contained in *collector into the index's * pending list. * * Function guarantees that all these tuples will be inserted consecutively, * preserving order */ void ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) { Relation index = ginstate->index; Buffer metabuffer; Page metapage; GinMetaPageData *metadata = NULL; Buffer buffer = InvalidBuffer; Page page = NULL; ginxlogUpdateMeta data; bool separateList = false; bool needCleanup = false; int cleanupSize; bool needWal; if (collector->ntuples == 0) return; needWal = RelationNeedsWAL(index); data.node = index->rd_node; data.ntuples = 0; data.newRightlink = data.prevTail = InvalidBlockNumber; metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO); metapage = BufferGetPage(metabuffer); if (collector->sumsize + collector->ntuples * sizeof(ItemIdData) > GinListPageSize) { /* * Total size is greater than one page => make sublist */ separateList = true; } else { LockBuffer(metabuffer, GIN_EXCLUSIVE); metadata = GinPageGetMeta(metapage); if (metadata->head == InvalidBlockNumber || collector->sumsize + collector->ntuples * sizeof(ItemIdData) > metadata->tailFreeSize) { /* * Pending list is empty or total size is greater than freespace * on tail page => make sublist * * We unlock metabuffer to keep high concurrency */ separateList = true; LockBuffer(metabuffer, GIN_UNLOCK); } } if (separateList) { /* * We should make sublist separately and append it to the tail */ GinMetaPageData sublist; memset(&sublist, 0, sizeof(GinMetaPageData)); makeSublist(index, collector->tuples, collector->ntuples, &sublist); if (needWal) XLogBeginInsert(); /* * metapage was unlocked, see above */ LockBuffer(metabuffer, GIN_EXCLUSIVE); metadata = GinPageGetMeta(metapage); if (metadata->head == InvalidBlockNumber) { /* * Main list is empty, so just insert sublist as main list */ START_CRIT_SECTION(); metadata->head = sublist.head; metadata->tail = sublist.tail; metadata->tailFreeSize = sublist.tailFreeSize; metadata->nPendingPages = sublist.nPendingPages; metadata->nPendingHeapTuples = sublist.nPendingHeapTuples; } else { /* * Merge lists */ data.prevTail = metadata->tail; data.newRightlink = sublist.head; buffer = ReadBuffer(index, metadata->tail); LockBuffer(buffer, GIN_EXCLUSIVE); page = BufferGetPage(buffer); Assert(GinPageGetOpaque(page)->rightlink == InvalidBlockNumber); START_CRIT_SECTION(); GinPageGetOpaque(page)->rightlink = sublist.head; MarkBufferDirty(buffer); metadata->tail = sublist.tail; metadata->tailFreeSize = sublist.tailFreeSize; metadata->nPendingPages += sublist.nPendingPages; metadata->nPendingHeapTuples += sublist.nPendingHeapTuples; if (needWal) XLogRegisterBuffer(1, buffer, REGBUF_STANDARD); } } else { /* * Insert into tail page. Metapage is already locked */ OffsetNumber l, off; int i, tupsize; char *ptr; char *collectordata; buffer = ReadBuffer(index, metadata->tail); LockBuffer(buffer, GIN_EXCLUSIVE); page = BufferGetPage(buffer); off = (PageIsEmpty(page)) ? 
FirstOffsetNumber : OffsetNumberNext(PageGetMaxOffsetNumber(page)); collectordata = ptr = (char *) palloc(collector->sumsize); data.ntuples = collector->ntuples; if (needWal) XLogBeginInsert(); START_CRIT_SECTION(); /* * Increase counter of heap tuples */ Assert(GinPageGetOpaque(page)->maxoff <= metadata->nPendingHeapTuples); GinPageGetOpaque(page)->maxoff++; metadata->nPendingHeapTuples++; for (i = 0; i < collector->ntuples; i++) { tupsize = IndexTupleSize(collector->tuples[i]); l = PageAddItem(page, (Item) collector->tuples[i], tupsize, off, false, false); if (l == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(index)); memcpy(ptr, collector->tuples[i], tupsize); ptr += tupsize; off++; } Assert((ptr - collectordata) <= collector->sumsize); if (needWal) { XLogRegisterBuffer(1, buffer, REGBUF_STANDARD); XLogRegisterBufData(1, collectordata, collector->sumsize); } metadata->tailFreeSize = PageGetExactFreeSpace(page); MarkBufferDirty(buffer); } /* * Write metabuffer, make xlog entry */ MarkBufferDirty(metabuffer); if (needWal) { XLogRecPtr recptr; memcpy(&data.metadata, metadata, sizeof(GinMetaPageData)); XLogRegisterBuffer(0, metabuffer, REGBUF_WILL_INIT); XLogRegisterData((char *) &data, sizeof(ginxlogUpdateMeta)); recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_UPDATE_META_PAGE); PageSetLSN(metapage, recptr); if (buffer != InvalidBuffer) { PageSetLSN(page, recptr); } } if (buffer != InvalidBuffer) UnlockReleaseBuffer(buffer); /* * Force pending list cleanup when it becomes too long. And, * ginInsertCleanup could take significant amount of time, so we prefer to * call it when it can do all the work in a single collection cycle. In * non-vacuum mode, it shouldn't require maintenance_work_mem, so fire it * while pending list is still small enough to fit into * gin_pending_list_limit. * * ginInsertCleanup() should not be called inside our CRIT_SECTION. */ cleanupSize = GinGetPendingListCleanupSize(index); if (metadata->nPendingPages * GIN_PAGE_FREESIZE > cleanupSize * 1024L) needCleanup = true; UnlockReleaseBuffer(metabuffer); END_CRIT_SECTION(); if (needCleanup) ginInsertCleanup(ginstate, false, true, NULL); }
/* * Build a pending-list page from the given array of tuples, and write it out. * * Returns amount of free space left on the page. */ static int32 writeListPage(Relation index, Buffer buffer, IndexTuple *tuples, int32 ntuples, BlockNumber rightlink) { Page page = BufferGetPage(buffer); int32 i, freesize, size = 0; OffsetNumber l, off; char *workspace; char *ptr; /* workspace could be a local array; we use palloc for alignment */ workspace = palloc(BLCKSZ); START_CRIT_SECTION(); GinInitBuffer(buffer, GIN_LIST); off = FirstOffsetNumber; ptr = workspace; for (i = 0; i < ntuples; i++) { int this_size = IndexTupleSize(tuples[i]); memcpy(ptr, tuples[i], this_size); ptr += this_size; size += this_size; l = PageAddItem(page, (Item) tuples[i], this_size, off, false, false); if (l == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(index)); off++; } Assert(size <= BLCKSZ); /* else we overran workspace */ GinPageGetOpaque(page)->rightlink = rightlink; /* * tail page may contain only whole row(s) or final part of row placed on * previous pages (a "row" here meaning all the index tuples generated for * one heap tuple) */ if (rightlink == InvalidBlockNumber) { GinPageSetFullRow(page); GinPageGetOpaque(page)->maxoff = 1; } else { GinPageGetOpaque(page)->maxoff = 0; } MarkBufferDirty(buffer); if (RelationNeedsWAL(index)) { ginxlogInsertListPage data; XLogRecPtr recptr; data.rightlink = rightlink; data.ntuples = ntuples; XLogBeginInsert(); XLogRegisterData((char *) &data, sizeof(ginxlogInsertListPage)); XLogRegisterBuffer(0, buffer, REGBUF_WILL_INIT); XLogRegisterBufData(0, workspace, size); recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT_LISTPAGE); PageSetLSN(page, recptr); } /* get free space before releasing buffer */ freesize = PageGetExactFreeSpace(page); UnlockReleaseBuffer(buffer); END_CRIT_SECTION(); pfree(workspace); return freesize; }
/*
 * Deletes pending list pages up to (not including) newHead page.
 * If newHead == InvalidBlockNumber then function drops the whole list.
 *
 * metapage is pinned and exclusive-locked throughout this function.
 *
 * Returns true if another cleanup process is running concurrently
 * (if so, we can just abandon our own efforts)
 */
static bool
shiftList(Relation index, Buffer metabuffer, BlockNumber newHead,
          bool fill_fsm, IndexBulkDeleteResult *stats)
{
    Page        metapage;
    GinMetaPageData *metadata;
    BlockNumber blknoToDelete;

    metapage = BufferGetPage(metabuffer);
    metadata = GinPageGetMeta(metapage);
    blknoToDelete = metadata->head;

    do
    {
        Page        page;
        int         i;
        int64       nDeletedHeapTuples = 0;
        ginxlogDeleteListPages data;
        Buffer      buffers[GIN_NDELETE_AT_ONCE];
        BlockNumber freespace[GIN_NDELETE_AT_ONCE];

        data.ndeleted = 0;
        while (data.ndeleted < GIN_NDELETE_AT_ONCE && blknoToDelete != newHead)
        {
            freespace[data.ndeleted] = blknoToDelete;
            buffers[data.ndeleted] = ReadBuffer(index, blknoToDelete);
            LockBuffer(buffers[data.ndeleted], GIN_EXCLUSIVE);
            page = BufferGetPage(buffers[data.ndeleted]);

            data.ndeleted++;

            if (GinPageIsDeleted(page))
            {
                /* concurrent cleanup process is detected */
                for (i = 0; i < data.ndeleted; i++)
                    UnlockReleaseBuffer(buffers[i]);

                return true;
            }

            nDeletedHeapTuples += GinPageGetOpaque(page)->maxoff;
            blknoToDelete = GinPageGetOpaque(page)->rightlink;
        }

        if (stats)
            stats->pages_deleted += data.ndeleted;

        /*
         * This operation touches an unusually large number of pages, so
         * prepare the XLogInsert machinery for that before entering the
         * critical section.
         */
        if (RelationNeedsWAL(index))
            XLogEnsureRecordSpace(data.ndeleted, 0);

        START_CRIT_SECTION();

        metadata->head = blknoToDelete;

        Assert(metadata->nPendingPages >= data.ndeleted);
        metadata->nPendingPages -= data.ndeleted;
        Assert(metadata->nPendingHeapTuples >= nDeletedHeapTuples);
        metadata->nPendingHeapTuples -= nDeletedHeapTuples;

        if (blknoToDelete == InvalidBlockNumber)
        {
            metadata->tail = InvalidBlockNumber;
            metadata->tailFreeSize = 0;
            metadata->nPendingPages = 0;
            metadata->nPendingHeapTuples = 0;
        }

        MarkBufferDirty(metabuffer);

        for (i = 0; i < data.ndeleted; i++)
        {
            page = BufferGetPage(buffers[i]);
            GinPageGetOpaque(page)->flags = GIN_DELETED;
            MarkBufferDirty(buffers[i]);
        }

        if (RelationNeedsWAL(index))
        {
            XLogRecPtr  recptr;

            XLogBeginInsert();
            XLogRegisterBuffer(0, metabuffer, REGBUF_WILL_INIT);
            for (i = 0; i < data.ndeleted; i++)
                XLogRegisterBuffer(i + 1, buffers[i], REGBUF_WILL_INIT);

            memcpy(&data.metadata, metadata, sizeof(GinMetaPageData));

            XLogRegisterData((char *) &data, sizeof(ginxlogDeleteListPages));

            recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_LISTPAGE);
            PageSetLSN(metapage, recptr);

            for (i = 0; i < data.ndeleted; i++)
            {
                page = BufferGetPage(buffers[i]);
                PageSetLSN(page, recptr);
            }
        }

        for (i = 0; i < data.ndeleted; i++)
            UnlockReleaseBuffer(buffers[i]);

        END_CRIT_SECTION();

        for (i = 0; fill_fsm && i < data.ndeleted; i++)
            RecordFreeIndexPage(index, freespace[i]);

    } while (blknoToDelete != newHead);

    return false;
}
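/*
 * The XLogEnsureRecordSpace() call above exists because XLogInsert() only
 * reserves room for a small number of registered buffers by default
 * (XLR_NORMAL_MAX_BLOCK_ID), while this record can reference the metapage
 * plus up to GIN_NDELETE_AT_ONCE list pages.  Enlarging the workspace can
 * allocate memory, so it must happen before START_CRIT_SECTION().  Below is
 * a minimal sketch of the same pattern for a hypothetical record touching
 * many buffers; "RM_MYRM_ID" and "XLOG_MYRM_MANY_PAGES" are illustrative
 * names only, and the caller is assumed to have exclusive-locked and
 * modified the pages already.
 */
static void
logManyBuffersSketch(Buffer *buffers, int nbuffers)
{
    int         i;
    XLogRecPtr  recptr;

    /* must precede the critical section: enlarging the space can palloc */
    XLogEnsureRecordSpace(nbuffers - 1, 0);     /* highest block ID we register */

    START_CRIT_SECTION();

    for (i = 0; i < nbuffers; i++)
        MarkBufferDirty(buffers[i]);

    XLogBeginInsert();
    for (i = 0; i < nbuffers; i++)
        XLogRegisterBuffer(i, buffers[i], REGBUF_STANDARD);

    recptr = XLogInsert(RM_MYRM_ID, XLOG_MYRM_MANY_PAGES); /* hypothetical IDs */

    for (i = 0; i < nbuffers; i++)
        PageSetLSN(BufferGetPage(buffers[i]), recptr);

    END_CRIT_SECTION();
}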
/*
 * Write a backup block if needed when we are setting a hint. Note that
 * this may be called for a variety of page types, not just heaps.
 *
 * Callable while holding just share lock on the buffer content.
 *
 * We can't use the plain backup block mechanism since that relies on the
 * buffer being exclusively locked. Some modifications (setting the LSN, hint
 * bits) are allowed while the buffer is only share-locked, and those can lead
 * to WAL checksum failures. So instead we copy the page and insert the copied
 * data as normal record data.
 *
 * We only need to do something if the page has not yet had a full-page image
 * written in this checkpoint round. The LSN of the inserted WAL record is
 * returned if we had to write, InvalidXLogRecPtr otherwise.
 *
 * It is possible that multiple concurrent backends could attempt to write
 * WAL records. In that case, multiple copies of the same block would be
 * recorded in separate WAL records by different backends, though that is
 * still OK from a correctness perspective.
 */
XLogRecPtr
XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
{
    XLogRecPtr  recptr = InvalidXLogRecPtr;
    XLogRecPtr  lsn;
    XLogRecPtr  RedoRecPtr;

    /*
     * Ensure no checkpoint can change our view of RedoRecPtr.
     */
    Assert(MyPgXact->delayChkpt);

    /*
     * Update RedoRecPtr so that we can make the right decision
     */
    RedoRecPtr = GetRedoRecPtr();

    /*
     * We assume page LSN is first data on *every* page that can be passed to
     * XLogInsert, whether it has the standard page layout or not. Since
     * we're only holding a share-lock on the page, we must take the buffer
     * header lock when we look at the LSN.
     */
    lsn = BufferGetLSNAtomic(buffer);

    if (lsn <= RedoRecPtr)
    {
        int         flags;
        char        copied_buffer[BLCKSZ];
        char       *origdata = (char *) BufferGetBlock(buffer);
        RelFileNode rnode;
        ForkNumber  forkno;
        BlockNumber blkno;

        /*
         * Copy buffer so we don't have to worry about concurrent hint bit or
         * lsn updates. We assume pd_lower/upper cannot be changed without an
         * exclusive lock, so the copied contents used for the backup image
         * are not racy.
         */
        if (buffer_std)
        {
            /* Assume we can omit data between pd_lower and pd_upper */
            Page        page = BufferGetPage(buffer);
            uint16      lower = ((PageHeader) page)->pd_lower;
            uint16      upper = ((PageHeader) page)->pd_upper;

            memcpy(copied_buffer, origdata, lower);
            memcpy(copied_buffer + upper, origdata + upper, BLCKSZ - upper);
        }
        else
            memcpy(copied_buffer, origdata, BLCKSZ);

        XLogBeginInsert();

        flags = REGBUF_FORCE_IMAGE;
        if (buffer_std)
            flags |= REGBUF_STANDARD;

        BufferGetTag(buffer, &rnode, &forkno, &blkno);
        XLogRegisterBlock(0, &rnode, forkno, blkno, copied_buffer, flags);

        recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI_FOR_HINT);
    }

    return recptr;
}
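/*
 * A minimal sketch of a caller of the function above, setting a hint bit
 * under a share lock.  The real caller in PostgreSQL is MarkBufferDirtyHint(),
 * whose locking and bookkeeping are more involved (for instance, it sets the
 * page LSN while holding the buffer header lock); "setHintWithFPISketch" is a
 * hypothetical name for illustration only.
 */
static void
setHintWithFPISketch(Buffer buffer, bool buffer_std)
{
    /* keep a checkpoint from moving RedoRecPtr under us (see Assert above) */
    MyPgXact->delayChkpt = true;

    /* only needed when checksums or wal_log_hints require hint-bit WAL */
    if (XLogHintBitIsNeeded() && !BufferIsLocal(buffer))
    {
        XLogRecPtr  lsn = XLogSaveBufferForHint(buffer, buffer_std);

        /*
         * If a full-page image was written, advance the page LSN so a torn
         * page can later be detected and repaired from that image.
         */
        if (!XLogRecPtrIsInvalid(lsn))
            PageSetLSN(BufferGetPage(buffer), lsn);
    }

    MyPgXact->delayChkpt = false;
}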
/*
 * Drop a table space
 *
 * Be careful to check that the tablespace is empty.
 */
void
DropTableSpace(DropTableSpaceStmt *stmt)
{
#ifdef HAVE_SYMLINK
    char       *tablespacename = stmt->tablespacename;
    HeapScanDesc scandesc;
    Relation    rel;
    HeapTuple   tuple;
    ScanKeyData entry[1];
    Oid         tablespaceoid;

    /*
     * Find the target tuple
     */
    rel = heap_open(TableSpaceRelationId, RowExclusiveLock);

    ScanKeyInit(&entry[0],
                Anum_pg_tablespace_spcname,
                BTEqualStrategyNumber, F_NAMEEQ,
                CStringGetDatum(tablespacename));
    scandesc = heap_beginscan_catalog(rel, 1, entry);
    tuple = heap_getnext(scandesc, ForwardScanDirection);

    if (!HeapTupleIsValid(tuple))
    {
        if (!stmt->missing_ok)
        {
            ereport(ERROR,
                    (errcode(ERRCODE_UNDEFINED_OBJECT),
                     errmsg("tablespace \"%s\" does not exist",
                            tablespacename)));
        }
        else
        {
            ereport(NOTICE,
                    (errmsg("tablespace \"%s\" does not exist, skipping",
                            tablespacename)));
            /* XXX I assume I need one or both of these next two calls */
            heap_endscan(scandesc);
            heap_close(rel, NoLock);
        }
        return;
    }

    tablespaceoid = HeapTupleGetOid(tuple);

    /* Must be tablespace owner */
    if (!pg_tablespace_ownercheck(tablespaceoid, GetUserId()))
        aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_TABLESPACE,
                       tablespacename);

    /* Disallow drop of the standard tablespaces, even by superuser */
    if (tablespaceoid == GLOBALTABLESPACE_OID ||
        tablespaceoid == DEFAULTTABLESPACE_OID)
        aclcheck_error(ACLCHECK_NO_PRIV, ACL_KIND_TABLESPACE,
                       tablespacename);

    /* DROP hook for the tablespace being removed */
    InvokeObjectDropHook(TableSpaceRelationId, tablespaceoid, 0);

    /*
     * Remove the pg_tablespace tuple (this will roll back if we fail below)
     */
    simple_heap_delete(rel, &tuple->t_self);

    heap_endscan(scandesc);

    /*
     * Remove any comments or security labels on this tablespace.
     */
    DeleteSharedComments(tablespaceoid, TableSpaceRelationId);
    DeleteSharedSecurityLabel(tablespaceoid, TableSpaceRelationId);

    /*
     * Remove dependency on owner.
     */
    deleteSharedDependencyRecordsFor(TableSpaceRelationId, tablespaceoid, 0);

    /*
     * Acquire TablespaceCreateLock to ensure that no TablespaceCreateDbspace
     * is running concurrently.
     */
    LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE);

    /*
     * Try to remove the physical infrastructure.
     */
    if (!destroy_tablespace_directories(tablespaceoid, false))
    {
        /*
         * Not all files deleted? However, there can be lingering empty files
         * in the directories, left behind by for example DROP TABLE, that
         * have been scheduled for deletion at next checkpoint (see comments
         * in mdunlink() for details). We could just delete them immediately,
         * but we can't tell them apart from important data files that we
         * mustn't delete. So instead, we force a checkpoint which will clean
         * out any lingering files, and try again.
         *
         * XXX On Windows, an unlinked file persists in the directory listing
         * until no process retains an open handle for the file. The DDL
         * commands that schedule files for unlink send invalidation messages
         * directing other PostgreSQL processes to close the files. DROP
         * TABLESPACE should not give up on the tablespace becoming empty
         * until all relevant invalidation processing is complete.
         */
        RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT);

        if (!destroy_tablespace_directories(tablespaceoid, false))
        {
            /* Still not empty, the files must be important then */
            ereport(ERROR,
                    (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                     errmsg("tablespace \"%s\" is not empty",
                            tablespacename)));
        }
    }

    /* Record the filesystem change in XLOG */
    {
        xl_tblspc_drop_rec xlrec;

        xlrec.ts_id = tablespaceoid;

        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, sizeof(xl_tblspc_drop_rec));

        (void) XLogInsert(RM_TBLSPC_ID, XLOG_TBLSPC_DROP);
    }

    /*
     * Note: because we checked that the tablespace was empty, there should be
     * no need to worry about flushing shared buffers or free space map
     * entries for relations in the tablespace.
     */

    /*
     * Force synchronous commit, to minimize the window between removing the
     * files on-disk and marking the transaction committed. It's not great
     * that there is any window at all, but definitely we don't want to make
     * it larger than necessary.
     */
    ForceSyncCommit();

    /*
     * Allow TablespaceCreateDbspace again.
     */
    LWLockRelease(TablespaceCreateLock);

    /* We keep the lock on pg_tablespace until commit */
    heap_close(rel, NoLock);
#else                           /* !HAVE_SYMLINK */
    ereport(ERROR,
            (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
             errmsg("tablespaces are not supported on this platform")));
#endif                          /* HAVE_SYMLINK */
}
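/*
 * Replay-side sketch for the XLOG_TBLSPC_DROP record emitted above.  This is
 * a heavily simplified illustration, not the actual redo routine (tblspc_redo
 * in tablespace.c), which additionally resolves recovery conflicts with
 * sessions still using the tablespace before giving up.
 */
static void
tblspcDropRedoSketch(XLogReaderState *record)
{
    xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) XLogRecGetData(record);

    /*
     * The record was only written after the directories were verified empty
     * on the primary, so a failure here normally means leftover temp files.
     */
    if (!destroy_tablespace_directories(xlrec->ts_id, true))
        elog(LOG, "directories for tablespace %u could not be removed",
             xlrec->ts_id);
}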
IndexBuildResult *
ginbuild(Relation heap, Relation index, IndexInfo *indexInfo)
{
    IndexBuildResult *result;
    double      reltuples;
    GinBuildState buildstate;
    Buffer      RootBuffer,
                MetaBuffer;
    ItemPointerData *list;
    Datum       key;
    GinNullCategory category;
    uint32      nlist;
    MemoryContext oldCtx;
    OffsetNumber attnum;

    if (RelationGetNumberOfBlocks(index) != 0)
        elog(ERROR, "index \"%s\" already contains data",
             RelationGetRelationName(index));

    initGinState(&buildstate.ginstate, index);
    buildstate.indtuples = 0;
    memset(&buildstate.buildStats, 0, sizeof(GinStatsData));

    /* initialize the meta page */
    MetaBuffer = GinNewBuffer(index);

    /* initialize the root page */
    RootBuffer = GinNewBuffer(index);

    START_CRIT_SECTION();
    GinInitMetabuffer(MetaBuffer);
    MarkBufferDirty(MetaBuffer);
    GinInitBuffer(RootBuffer, GIN_LEAF);
    MarkBufferDirty(RootBuffer);

    if (RelationNeedsWAL(index))
    {
        XLogRecPtr  recptr;
        Page        page;

        XLogBeginInsert();
        XLogRegisterBuffer(0, MetaBuffer, REGBUF_WILL_INIT | REGBUF_STANDARD);
        XLogRegisterBuffer(1, RootBuffer, REGBUF_WILL_INIT);

        recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_INDEX);

        page = BufferGetPage(RootBuffer);
        PageSetLSN(page, recptr);

        page = BufferGetPage(MetaBuffer);
        PageSetLSN(page, recptr);
    }

    UnlockReleaseBuffer(MetaBuffer);
    UnlockReleaseBuffer(RootBuffer);
    END_CRIT_SECTION();

    /* count the root as first entry page */
    buildstate.buildStats.nEntryPages++;

    /*
     * create a temporary memory context that is used to hold data not yet
     * dumped out to the index
     */
    buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
                                              "Gin build temporary context",
                                              ALLOCSET_DEFAULT_SIZES);

    /*
     * create a temporary memory context that is used for calling
     * ginExtractEntries(), and can be reset after each tuple
     */
    buildstate.funcCtx = AllocSetContextCreate(CurrentMemoryContext,
                                               "Gin build temporary context for user-defined function",
                                               ALLOCSET_DEFAULT_SIZES);

    buildstate.accum.ginstate = &buildstate.ginstate;
    ginInitBA(&buildstate.accum);

    /*
     * Do the heap scan. We disallow sync scan here because dataPlaceToPage
     * prefers to receive tuples in TID order.
     */
    reltuples = IndexBuildHeapScan(heap, index, indexInfo, false,
                                   ginBuildCallback, (void *) &buildstate, NULL);

    /* dump remaining entries to the index */
    oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx);
    ginBeginBAScan(&buildstate.accum);
    while ((list = ginGetBAEntry(&buildstate.accum,
                                 &attnum, &key, &category, &nlist)) != NULL)
    {
        /* there could be many entries, so be willing to abort here */
        CHECK_FOR_INTERRUPTS();
        ginEntryInsert(&buildstate.ginstate, attnum, key, category,
                       list, nlist, &buildstate.buildStats);
    }
    MemoryContextSwitchTo(oldCtx);

    MemoryContextDelete(buildstate.funcCtx);
    MemoryContextDelete(buildstate.tmpCtx);

    /*
     * Update metapage stats
     */
    buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index);
    ginUpdateStats(index, &buildstate.buildStats);

    /*
     * Return statistics
     */
    result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

    result->heap_tuples = reltuples;
    result->index_tuples = buildstate.indtuples;

    return result;
}
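/*
 * Sketch of the build callback handed to IndexBuildHeapScan() above.  The
 * point is the memory-context discipline: per-tuple work happens in tmpCtx,
 * and the accumulator is flushed to the index whenever it outgrows
 * maintenance_work_mem, mirroring the final dump loop in ginbuild().  This is
 * a hedged sketch, not the actual ginBuildCallback; in particular
 * "extractAndAccumulate" is a hypothetical helper standing in for the
 * per-column entry extraction.
 */
static void
ginBuildCallbackSketch(Relation index, HeapTuple htup, Datum *values,
                       bool *isnull, bool tupleIsAlive, void *state)
{
    GinBuildState *buildstate = (GinBuildState *) state;
    MemoryContext oldCtx;

    oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);

    /* extract entries for each indexed column into buildstate->accum */
    extractAndAccumulate(buildstate, htup, values, isnull); /* hypothetical */

    /* if we've maxed out our available memory, dump everything to the index */
    if (buildstate->accum.allocatedMemory >= (Size) maintenance_work_mem * 1024L)
    {
        ItemPointerData *list;
        Datum       key;
        GinNullCategory category;
        uint32      nlist;
        OffsetNumber attnum;

        ginBeginBAScan(&buildstate->accum);
        while ((list = ginGetBAEntry(&buildstate->accum,
                                     &attnum, &key, &category, &nlist)) != NULL)
        {
            CHECK_FOR_INTERRUPTS();
            ginEntryInsert(&buildstate->ginstate, attnum, key, category,
                           list, nlist, &buildstate->buildStats);
        }

        /* start over with an empty accumulator and a fresh context */
        MemoryContextReset(buildstate->tmpCtx);
        ginInitBA(&buildstate->accum);
    }

    MemoryContextSwitchTo(oldCtx);
}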
/*
 * Main entry point to GiST index build. Initially calls insert over and over,
 * but switches to more efficient buffering build algorithm after a certain
 * number of tuples (unless buffering mode is disabled).
 */
IndexBuildResult *
gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
{
    IndexBuildResult *result;
    double      reltuples;
    GISTBuildState buildstate;
    Buffer      buffer;
    Page        page;
    MemoryContext oldcxt = CurrentMemoryContext;
    int         fillfactor;

    buildstate.indexrel = index;
    if (index->rd_options)
    {
        /* Get buffering mode from the options string */
        GiSTOptions *options = (GiSTOptions *) index->rd_options;
        char       *bufferingMode = (char *) options + options->bufferingModeOffset;

        if (strcmp(bufferingMode, "on") == 0)
            buildstate.bufferingMode = GIST_BUFFERING_STATS;
        else if (strcmp(bufferingMode, "off") == 0)
            buildstate.bufferingMode = GIST_BUFFERING_DISABLED;
        else
            buildstate.bufferingMode = GIST_BUFFERING_AUTO;

        fillfactor = options->fillfactor;
    }
    else
    {
        /*
         * By default, switch to buffering mode when the index grows too large
         * to fit in cache.
         */
        buildstate.bufferingMode = GIST_BUFFERING_AUTO;
        fillfactor = GIST_DEFAULT_FILLFACTOR;
    }
    /* Calculate target amount of free space to leave on pages */
    buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100;

    /*
     * We expect to be called exactly once for any index relation. If that's
     * not the case, big trouble's what we have.
     */
    if (RelationGetNumberOfBlocks(index) != 0)
        elog(ERROR, "index \"%s\" already contains data",
             RelationGetRelationName(index));

    /* no locking is needed */
    buildstate.giststate = initGISTstate(index);

    /*
     * Create a temporary memory context that is reset once for each tuple
     * processed. (Note: we don't bother to make this a child of the
     * giststate's scanCxt, so we have to delete it separately at the end.)
     */
    buildstate.giststate->tempCxt = createTempGistContext();

    /* initialize the root page */
    buffer = gistNewBuffer(index);
    Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
    page = BufferGetPage(buffer);

    START_CRIT_SECTION();

    GISTInitBuffer(buffer, F_LEAF);

    MarkBufferDirty(buffer);

    if (RelationNeedsWAL(index))
    {
        XLogRecPtr  recptr;

        XLogBeginInsert();
        XLogRegisterBuffer(0, buffer, REGBUF_WILL_INIT);

        recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX);
        PageSetLSN(page, recptr);
    }
    else
        PageSetLSN(page, gistGetFakeLSN(heap));

    UnlockReleaseBuffer(buffer);

    END_CRIT_SECTION();

    /* build the index */
    buildstate.indtuples = 0;
    buildstate.indtuplesSize = 0;

    /*
     * Do the heap scan.
     */
    reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
                                   gistBuildCallback, (void *) &buildstate);

    /*
     * If buffering was used, flush out all the tuples that are still in the
     * buffers.
     */
    if (buildstate.bufferingMode == GIST_BUFFERING_ACTIVE)
    {
        elog(DEBUG1, "all tuples processed, emptying buffers");
        gistEmptyAllBuffers(&buildstate);
        gistFreeBuildBuffers(buildstate.gfbb);
    }

    /* okay, all heap tuples are indexed */
    MemoryContextSwitchTo(oldcxt);
    MemoryContextDelete(buildstate.giststate->tempCxt);

    freeGISTstate(buildstate.giststate);

    /*
     * Return statistics
     */
    result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

    result->heap_tuples = reltuples;
    result->index_tuples = (double) buildstate.indtuples;

    return result;
}
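/*
 * Usage note for the reloptions consulted above.  Buffering mode and
 * fillfactor are ordinary GiST storage parameters, so a build can be forced
 * into (or out of) buffered mode at CREATE INDEX time, for example:
 *
 *      CREATE INDEX test_gist_idx ON test USING gist (geom)
 *          WITH (buffering = on, fillfactor = 90);
 *
 * (Index, table, and column names here are illustrative.)  With the default
 * BLCKSZ of 8192 and fillfactor = 90, the freespace target computed above is
 * 8192 * (100 - 90) / 100 = 819 bytes left unused on each page during the
 * build.
 */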