/* * Allocate a new page (either by recycling, or by extending the index file). * * The returned buffer is already pinned and exclusive-locked. * Caller is responsible for initializing the page by calling SpGistInitBuffer. */ Buffer SpGistNewBuffer(Relation index) { Buffer buffer; bool needLock; /* First, try to get a page from FSM */ for (;;) { BlockNumber blkno = GetFreeIndexPage(index); if (blkno == InvalidBlockNumber) break; /* nothing known to FSM */ /* * The fixed pages shouldn't ever be listed in FSM, but just in case * one is, ignore it. */ if (SpGistBlockIsFixed(blkno)) continue; buffer = ReadBuffer(index, blkno); /* * We have to guard against the possibility that someone else already * recycled this page; the buffer may be locked if so. */ if (ConditionalLockBuffer(buffer)) { Page page = BufferGetPage(buffer); if (PageIsNew(page)) return buffer; /* OK to use, if never initialized */ if (SpGistPageIsDeleted(page) || PageIsEmpty(page)) return buffer; /* OK to use */ LockBuffer(buffer, BUFFER_LOCK_UNLOCK); } /* Can't use it, so release buffer and try again */ ReleaseBuffer(buffer); } /* Must extend the file */ needLock = !RELATION_IS_LOCAL(index); if (needLock) LockRelationForExtension(index, ExclusiveLock); buffer = ReadBuffer(index, P_NEW); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); if (needLock) UnlockRelationForExtension(index, ExclusiveLock); return buffer; }
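As the header comment says, the caller must initialize the returned page. A minimal caller sketch (assuming the standard SP-GiST helpers SpGistInitBuffer and the SPGIST_LEAF flag; the wrapper name is hypothetical):

/* Hedged sketch: allocate a new SP-GiST page and initialize it as a leaf page. */
static Buffer
spgist_new_leaf_page(Relation index)
{
    Buffer      buffer = SpGistNewBuffer(index);    /* pinned and exclusive-locked */

    /* the allocator does not touch the page contents; that is the caller's job */
    SpGistInitBuffer(buffer, SPGIST_LEAF);

    /* caller still has to MarkBufferDirty(), WAL-log the change, and
     * eventually UnlockReleaseBuffer() within its own critical section */
    return buffer;
}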
/* * Allocate a new page (either by recycling, or by extending the index file) * * The returned buffer is already pinned and exclusive-locked * * Caller is responsible for initializing the page by calling GISTInitBuffer */ Buffer gistNewBuffer(Relation r) { Buffer buffer; bool needLock; MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD; /* First, try to get a page from FSM */ for (;;) { BlockNumber blkno = GetFreeIndexPage(&r->rd_node); if (blkno == InvalidBlockNumber) break; /* nothing left in FSM */ buffer = ReadBuffer(r, blkno); /* * We have to guard against the possibility that someone else already * recycled this page; the buffer may be locked if so. */ if (ConditionalLockBuffer(buffer)) { Page page = BufferGetPage(buffer); if (PageIsNew(page)) return buffer; /* OK to use, if never initialized */ gistcheckpage(r, buffer); if (GistPageIsDeleted(page)) return buffer; /* OK to use */ LockBuffer(buffer, GIST_UNLOCK); } /* Can't use it, so release buffer and try again */ ReleaseBuffer(buffer); } /* Must extend the file */ needLock = !RELATION_IS_LOCAL(r); if (needLock) LockRelationForExtension(r, ExclusiveLock); buffer = ReadBuffer(r, P_NEW); LockBuffer(buffer, GIST_EXCLUSIVE); if (needLock) UnlockRelationForExtension(r, ExclusiveLock); return buffer; }
Buffer GinNewBuffer(Relation index) { Buffer buffer; bool needLock; /* First, try to get a page from FSM */ for (;;) { BlockNumber blkno = GetFreeIndexPage(index); if (blkno == InvalidBlockNumber) break; buffer = ReadBuffer(index, blkno); /* * We have to guard against the possibility that someone else already * recycled this page; the buffer may be locked if so. */ if (ConditionalLockBuffer(buffer)) { Page page = BufferGetPage(buffer); if (PageIsNew(page)) return buffer; /* OK to use, if never initialized */ if (GinPageIsDeleted(page)) return buffer; /* OK to use */ LockBuffer(buffer, GIN_UNLOCK); } /* Can't use it, so release buffer and try again */ ReleaseBuffer(buffer); } /* Must extend the file */ needLock = !RELATION_IS_LOCAL(index); if (needLock) LockRelationForExtension(index, ExclusiveLock); buffer = ReadBuffer(index, P_NEW); LockBuffer(buffer, GIN_EXCLUSIVE); if (needLock) UnlockRelationForExtension(index, ExclusiveLock); return buffer; }
/* * _bitmap_getbuf() -- return the buffer for the given block number and * the access method. */ Buffer _bitmap_getbuf(Relation rel, BlockNumber blkno, int access) { Buffer buf; MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD; if (blkno != P_NEW) { buf = ReadBuffer(rel, blkno); if (access != BM_NOLOCK) LockBuffer(buf, access); } else { bool needLock; Assert(access == BM_WRITE); /* * Extend the relation by one page. * * We have to use a lock to ensure no one else is extending the rel at * the same time, else we will both try to initialize the same new * page. We can skip locking for new or temp relations, however, * since no one else could be accessing them. */ needLock = !RELATION_IS_LOCAL(rel); if (needLock) LockRelationForExtension(rel, ExclusiveLock); buf = ReadBuffer(rel, P_NEW); /* Acquire buffer lock on new page */ LockBuffer(buf, BM_WRITE); /* * Release the file-extension lock; it's now OK for someone else to * extend the relation some more. */ if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); } return buf; }
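For illustration, a hedged sketch of the two calling modes described above (the wrapper name is hypothetical; BM_WRITE is the exclusive-lock mode that the P_NEW branch asserts):

/* Hedged sketch: the two calling modes of _bitmap_getbuf(). */
static void
bitmap_getbuf_usage(Relation rel, BlockNumber blkno)
{
    /* Read and exclusive-lock an existing bitmap page. */
    Buffer      buf = _bitmap_getbuf(rel, blkno, BM_WRITE);

    /* ... modify the page, MarkBufferDirty(buf), WAL-log ... */
    UnlockReleaseBuffer(buf);

    /* Append a brand-new page; BM_WRITE is required when blkno == P_NEW. */
    buf = _bitmap_getbuf(rel, P_NEW, BM_WRITE);

    /* ... initialize the new page before first use ... */
    UnlockReleaseBuffer(buf);
}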
/* * _bt_mergeload - Merge two streams of index tuples into new index files. */ static void _bt_mergeload(Spooler *self, BTWriteState *wstate, BTSpool *btspool, BTReader *btspool2, Relation heapRel) { BTPageState *state = NULL; IndexTuple itup, itup2; bool should_free = false; TupleDesc tupdes = RelationGetDescr(wstate->index); int keysz = RelationGetNumberOfAttributes(wstate->index); ScanKey indexScanKey; ON_DUPLICATE on_duplicate = self->on_duplicate; Assert(btspool != NULL); /* prepare for the merge */ itup = BTSpoolGetNextItem(btspool, NULL, &should_free); itup2 = BTReaderGetNextItem(btspool2); indexScanKey = _bt_mkscankey_nodata(wstate->index); for (;;) { bool load1 = true; /* load from BTSpool next? */ bool hasnull; int32 compare; if (self->dup_old + self->dup_new > self->max_dup_errors) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Maximum duplicate error count exceeded"))); if (itup2 == NULL) { if (itup == NULL) break; } else if (itup != NULL) { compare = compare_indextuple(itup, itup2, indexScanKey, keysz, tupdes, &hasnull); if (compare == 0 && !hasnull && btspool->isunique) { ItemPointerData t_tid2; /* * heap_is_visible() updates t_tid, so save a copy here because we * still need the original value for the index. */ ItemPointerCopy(&itup2->t_tid, &t_tid2); /* The tuple pointed to by the old index entry should not be visible. */ if (!heap_is_visible(heapRel, &itup->t_tid)) { itup = BTSpoolGetNextItem(btspool, itup, &should_free); } else if (!heap_is_visible(heapRel, &itup2->t_tid)) { itup2 = BTReaderGetNextItem(btspool2); } else { if (on_duplicate == ON_DUPLICATE_KEEP_NEW) { self->dup_old++; remove_duplicate(self, heapRel, itup2, RelationGetRelationName(wstate->index)); itup2 = BTReaderGetNextItem(btspool2); } else { ItemPointerCopy(&t_tid2, &itup2->t_tid); self->dup_new++; remove_duplicate(self, heapRel, itup, RelationGetRelationName(wstate->index)); itup = BTSpoolGetNextItem(btspool, itup, &should_free); } } continue; } else if (compare > 0) load1 = false; } else load1 = false; BULKLOAD_PROFILE(&prof_merge_unique); /* When we see the first tuple, create the first index page */ if (state == NULL) state = _bt_pagestate(wstate, 0); if (load1) { IndexTuple next_itup = NULL; bool next_should_free = false; for (;;) { /* get next item */ next_itup = BTSpoolGetNextItem(btspool, next_itup, &next_should_free); if (!btspool->isunique || next_itup == NULL) break; compare = compare_indextuple(itup, next_itup, indexScanKey, keysz, tupdes, &hasnull); if (compare < 0 || hasnull) break; if (compare > 0) { /* shouldn't happen */ elog(ERROR, "failed in tuplesort_performsort"); } /* * If the tuple was deleted because of another unique index, it is * not visible. */ if (!heap_is_visible(heapRel, &next_itup->t_tid)) { continue; } if (!heap_is_visible(heapRel, &itup->t_tid)) { if (should_free) pfree(itup); itup = next_itup; should_free = next_should_free; next_should_free = false; continue; } /* not unique between input files */ self->dup_new++; remove_duplicate(self, heapRel, next_itup, RelationGetRelationName(wstate->index)); if (self->dup_old + self->dup_new > self->max_dup_errors) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Maximum duplicate error count exceeded"))); } _bt_buildadd(wstate, state, itup); if (should_free) pfree(itup); itup = next_itup; should_free = next_should_free; } else { _bt_buildadd(wstate, state, itup2); itup2 = BTReaderGetNextItem(btspool2); } BULKLOAD_PROFILE(&prof_merge_insert); } _bt_freeskey(indexScanKey); /* Close down final pages and write the metapage */ _bt_uppershutdown(wstate, state); /* * If the index isn't 
temp, we must fsync it down to disk before it's safe * to commit the transaction. (For a temp index we don't care since the * index will be uninteresting after a crash anyway.) * * It's obvious that we must do this when not WAL-logging the build. It's * less obvious that we have to do it even if we did WAL-log the index * pages. The reason is that since we're building outside shared buffers, * a CHECKPOINT occurring during the build has no way to flush the * previously written data to disk (indeed it won't know the index even * exists). A crash later on would replay WAL from the checkpoint, * therefore it wouldn't replay our earlier WAL entries. If we do not * fsync those pages here, they might still not be on disk when the crash * occurs. */ if (!RELATION_IS_LOCAL(wstate->index)) { RelationOpenSmgr(wstate->index); smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM); } BULKLOAD_PROFILE(&prof_merge_term); }
static void _bt_mergebuild(Spooler *self, BTSpool *btspool) { Relation heapRel = self->relinfo->ri_RelationDesc; BTWriteState wstate; BTReader reader; bool merge; Assert(btspool->index->rd_index->indisvalid); tuplesort_performsort(btspool->sortstate); wstate.index = btspool->index; /* * We need to log index creation in WAL iff WAL archiving is enabled AND * it's not a temp index. */ wstate.btws_use_wal = self->use_wal && XLogArchivingActive() && !RELATION_IS_LOCAL(wstate.index); /* reserve the metapage */ wstate.btws_pages_alloced = BTREE_METAPAGE + 1; wstate.btws_pages_written = 0; wstate.btws_zeropage = NULL; /* until needed */ /* * Flush dirty buffers so that we can read the index files directly and * pick up pre-existing data. We must acquire AccessExclusiveLock on the * index before calling FlushRelationBuffers(). */ LockRelation(wstate.index, AccessExclusiveLock); FlushRelationBuffers(wstate.index); BULKLOAD_PROFILE(&prof_flush); merge = BTReaderInit(&reader, wstate.index); elog(DEBUG1, "pg_bulkload: build \"%s\" %s merge (%s wal)", RelationGetRelationName(wstate.index), merge ? "with" : "without", wstate.btws_use_wal ? "with" : "without"); /* Assign a new file node. */ RelationSetNewRelfilenode(wstate.index, InvalidTransactionId); if (merge || (btspool->isunique && self->max_dup_errors > 0)) { /* Merge two streams into the new file node that we assigned. */ BULKLOAD_PROFILE_PUSH(); _bt_mergeload(self, &wstate, btspool, &reader, heapRel); BULKLOAD_PROFILE_POP(); BULKLOAD_PROFILE(&prof_merge); } else { /* Fast path for newly created index. */ _bt_load(&wstate, btspool, NULL); BULKLOAD_PROFILE(&prof_index); } BTReaderTerm(&reader); }
/* * VACUUM cleanup: update FSM */ IndexBulkDeleteResult * gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) { Relation rel = info->index; BlockNumber npages, blkno; BlockNumber totFreePages; bool needLock; /* No-op in ANALYZE ONLY mode */ if (info->analyze_only) return stats; /* Set up all-zero stats if gistbulkdelete wasn't called */ if (stats == NULL) { stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); /* use heap's tuple count */ stats->num_index_tuples = info->num_heap_tuples; stats->estimated_count = info->estimated_count; /* * XXX the above is wrong if index is partial. Would it be OK to just * return NULL, or is there work we must do below? */ } /* * Need lock unless it's local to this backend. */ needLock = !RELATION_IS_LOCAL(rel); /* try to find deleted pages */ if (needLock) LockRelationForExtension(rel, ExclusiveLock); npages = RelationGetNumberOfBlocks(rel); if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); totFreePages = 0; for (blkno = GIST_ROOT_BLKNO + 1; blkno < npages; blkno++) { Buffer buffer; Page page; vacuum_delay_point(); buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); LockBuffer(buffer, GIST_SHARE); page = (Page) BufferGetPage(buffer); if (PageIsNew(page) || GistPageIsDeleted(page)) { totFreePages++; RecordFreeIndexPage(rel, blkno); } UnlockReleaseBuffer(buffer); } /* Finally, vacuum the FSM */ IndexFreeSpaceMapVacuum(info->index); /* return statistics */ stats->pages_free = totFreePages; if (needLock) LockRelationForExtension(rel, ExclusiveLock); stats->num_pages = RelationGetNumberOfBlocks(rel); if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); return stats; }
/* * Try to extend the revmap by one page. This might not happen for a number of * reasons; caller is expected to retry until the expected outcome is obtained. */ static void revmap_physical_extend(BrinRevmap *revmap) { Buffer buf; Page page; Page metapage; BrinMetaPageData *metadata; BlockNumber mapBlk; BlockNumber nblocks; Relation irel = revmap->rm_irel; bool needLock = !RELATION_IS_LOCAL(irel); /* * Lock the metapage. This locks out concurrent extensions of the revmap, * but note that we still need to grab the relation extension lock because * another backend can extend the index with regular BRIN pages. */ LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_EXCLUSIVE); metapage = BufferGetPage(revmap->rm_metaBuf); metadata = (BrinMetaPageData *) PageGetContents(metapage); /* * Check that our cached lastRevmapPage value was up-to-date; if it * wasn't, update the cached copy and have caller start over. */ if (metadata->lastRevmapPage != revmap->rm_lastRevmapPage) { revmap->rm_lastRevmapPage = metadata->lastRevmapPage; LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK); return; } mapBlk = metadata->lastRevmapPage + 1; nblocks = RelationGetNumberOfBlocks(irel); if (mapBlk < nblocks) { buf = ReadBuffer(irel, mapBlk); LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); page = BufferGetPage(buf); } else { if (needLock) LockRelationForExtension(irel, ExclusiveLock); buf = ReadBuffer(irel, P_NEW); if (BufferGetBlockNumber(buf) != mapBlk) { /* * Very rare corner case: somebody extended the relation * concurrently after we read its length. If this happens, give * up and have caller start over. We will have to evacuate that * page from under whoever is using it. */ if (needLock) UnlockRelationForExtension(irel, ExclusiveLock); LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK); return; } LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); page = BufferGetPage(buf); if (needLock) UnlockRelationForExtension(irel, ExclusiveLock); } /* Check that it's a regular block (or an empty page) */ if (!PageIsNew(page) && !BRIN_IS_REGULAR_PAGE(page)) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("unexpected page type 0x%04X in BRIN index \"%s\" block %u", BRIN_PAGE_TYPE(page), RelationGetRelationName(irel), BufferGetBlockNumber(buf)))); /* If the page is in use, evacuate it and restart */ if (brin_start_evacuating_page(irel, buf)) { LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK); brin_evacuate_page(irel, revmap->rm_pagesPerRange, revmap, buf); /* have caller start over */ return; } /* * Ok, we have now locked the metapage and the target block. Re-initialize * it as a revmap page. */ START_CRIT_SECTION(); /* the rm_tids array is initialized to all invalid by PageInit */ brin_page_init(page, BRIN_PAGETYPE_REVMAP); MarkBufferDirty(buf); metadata->lastRevmapPage = mapBlk; MarkBufferDirty(revmap->rm_metaBuf); if (RelationNeedsWAL(revmap->rm_irel)) { xl_brin_revmap_extend xlrec; XLogRecPtr recptr; XLogRecData rdata[2]; xlrec.node = revmap->rm_irel->rd_node; xlrec.targetBlk = mapBlk; rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfBrinRevmapExtend; rdata[0].buffer = InvalidBuffer; rdata[0].buffer_std = false; rdata[0].next = &(rdata[1]); rdata[1].data = (char *) NULL; rdata[1].len = 0; rdata[1].buffer = revmap->rm_metaBuf; rdata[1].buffer_std = false; rdata[1].next = NULL; recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_REVMAP_EXTEND, rdata); PageSetLSN(metapage, recptr); PageSetLSN(page, recptr); } END_CRIT_SECTION(); LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK); UnlockReleaseBuffer(buf); }
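Because this routine can return without having extended anything, its caller is expected to loop. A hedged sketch of that retry pattern (the helper name and the wantRevmapBlk parameter are hypothetical; the real caller derives the target revmap block from a heap block number):

/* Hedged sketch: keep extending until the revmap reaches the wanted block. */
static void
revmap_extend_until(BrinRevmap *revmap, BlockNumber wantRevmapBlk)
{
    /* rm_lastRevmapPage is refreshed by revmap_physical_extend() on each call */
    while (revmap->rm_lastRevmapPage < wantRevmapBlk)
        revmap_physical_extend(revmap);
}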
Datum gistvacuumcleanup(PG_FUNCTION_ARGS) { IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); GistBulkDeleteResult *stats = (GistBulkDeleteResult *) PG_GETARG_POINTER(1); Relation rel = info->index; BlockNumber npages, blkno; BlockNumber totFreePages, nFreePages, *freePages, maxFreePages; BlockNumber lastBlock = GIST_ROOT_BLKNO, lastFilledBlock = GIST_ROOT_BLKNO; bool needLock; /* Set up all-zero stats if gistbulkdelete wasn't called */ if (stats == NULL) { stats = (GistBulkDeleteResult *) palloc0(sizeof(GistBulkDeleteResult)); /* use heap's tuple count */ Assert(info->num_heap_tuples >= 0); stats->std.num_index_tuples = info->num_heap_tuples; /* * XXX the above is wrong if index is partial. Would it be OK to just * return NULL, or is there work we must do below? */ } /* gistVacuumUpdate may cause hard work */ if (info->vacuum_full) { GistVacuum gv; ArrayTuple res; /* note: vacuum.c already acquired AccessExclusiveLock on index */ gv.index = rel; initGISTstate(&(gv.giststate), rel); gv.opCtx = createTempGistContext(); gv.result = stats; gv.strategy = info->strategy; /* walk through the entire index for update tuples */ res = gistVacuumUpdate(&gv, GIST_ROOT_BLKNO, false); /* cleanup */ if (res.itup) { int i; for (i = 0; i < res.ituplen; i++) pfree(res.itup[i]); pfree(res.itup); } freeGISTstate(&(gv.giststate)); MemoryContextDelete(gv.opCtx); } else if (stats->needFullVacuum) ereport(NOTICE, (errmsg("index \"%s\" needs VACUUM FULL or REINDEX to finish crash recovery", RelationGetRelationName(rel)))); /* * If vacuum full, we already have exclusive lock on the index. Otherwise, * need lock unless it's local to this backend. */ if (info->vacuum_full) needLock = false; else needLock = !RELATION_IS_LOCAL(rel); /* try to find deleted pages */ if (needLock) LockRelationForExtension(rel, ExclusiveLock); npages = RelationGetNumberOfBlocks(rel); if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); maxFreePages = npages; if (maxFreePages > MaxFSMPages) maxFreePages = MaxFSMPages; totFreePages = nFreePages = 0; freePages = (BlockNumber *) palloc(sizeof(BlockNumber) * maxFreePages); for (blkno = GIST_ROOT_BLKNO + 1; blkno < npages; blkno++) { Buffer buffer; Page page; vacuum_delay_point(); buffer = ReadBufferWithStrategy(rel, blkno, info->strategy); LockBuffer(buffer, GIST_SHARE); page = (Page) BufferGetPage(buffer); if (PageIsNew(page) || GistPageIsDeleted(page)) { if (nFreePages < maxFreePages) freePages[nFreePages++] = blkno; totFreePages++; } else lastFilledBlock = blkno; UnlockReleaseBuffer(buffer); } lastBlock = npages - 1; if (info->vacuum_full && nFreePages > 0) { /* try to truncate index */ int i; for (i = 0; i < nFreePages; i++) if (freePages[i] >= lastFilledBlock) { totFreePages = nFreePages = i; break; } if (lastBlock > lastFilledBlock) RelationTruncate(rel, lastFilledBlock + 1); stats->std.pages_removed = lastBlock - lastFilledBlock; } RecordIndexFreeSpace(&rel->rd_node, totFreePages, nFreePages, freePages); pfree(freePages); /* return statistics */ stats->std.pages_free = totFreePages; if (needLock) LockRelationForExtension(rel, ExclusiveLock); stats->std.num_pages = RelationGetNumberOfBlocks(rel); if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); PG_RETURN_POINTER(stats); }
/** * @brief Write out the contents of the block buffer. The number of blocks * to write is derived from loader->curblk. * * Flow: * <ol> * <li>If no more space is available in the data file, switch to a new one.</li> * <li>Compute how many blocks can be written to the current file.</li> * <li>Save the last block number in the load status file.</li> * <li>Write to the current file.</li> * <li>If more data remains, write it too.</li> * </ol> * * @param loader [in] Direct Writer. * @return void */ static void flush_pages(DirectWriter *loader) { int i; int num; LoadStatus *ls = &loader->ls; num = loader->curblk; if (!PageIsEmpty(GetCurrentPage(loader))) num += 1; if (num <= 0) return; /* no work */ /* * Add a WAL entry (for the first page only) to ensure the current xid will * be recorded in xlog. We must flush the xlog records with XLogFlush() * before writing any data blocks, to follow the WAL protocol. * * If a postgres process, such as the loader or COPY, is killed with * "kill -9", the database will be rewound to the last checkpoint and * recovery will be performed using WAL. * * After recovery, any xids that were never recorded in WAL may be reused. * * However, the loader and COPY have already written data files stamped * with that xid, so it must not be reused. * * A WAL entry carrying the xid can be added with XLogInsert(), but such * entries are not written to disk immediately; they are flushed by * XLogFlush(), typically when a transaction is committed. COPY prevents * xid reuse by this method. */ #if PG_VERSION_NUM >= 90100 if (ls->ls.create_cnt == 0 && !RELATION_IS_LOCAL(loader->base.rel) && !(loader->base.rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED) ) { XLogRecPtr recptr; recptr = log_newpage(&ls->ls.rnode, MAIN_FORKNUM, ls->ls.exist_cnt, loader->blocks); XLogFlush(recptr); } #else if (ls->ls.create_cnt == 0 && !RELATION_IS_LOCAL(loader->base.rel) ) { XLogRecPtr recptr; recptr = log_newpage(&ls->ls.rnode, MAIN_FORKNUM, ls->ls.exist_cnt, loader->blocks); XLogFlush(recptr); } #endif /* * Write blocks. We might need to write to multiple files when crossing * relation segment boundaries. */ for (i = 0; i < num;) { char *buffer; int total; int written; int flush_num; BlockNumber relblks = LS_TOTAL_CNT(ls); /* Switch to the next file if the current file has been filled up. */ if (relblks % RELSEG_SIZE == 0) close_data_file(loader); if (loader->datafd == -1) loader->datafd = open_data_file(ls->ls.rnode, RELATION_IS_LOCAL(loader->base.rel), relblks); /* Number of blocks to be added to the current file. */ flush_num = Min(num - i, RELSEG_SIZE - relblks % RELSEG_SIZE); Assert(flush_num > 0); /* Write the last block number to the load status file. */ UpdateLSF(loader, flush_num); #if PG_VERSION_NUM >= 90300 /* If we need a checksum, add it */ if (DataChecksumsEnabled()){ int j = 0; Page contained_page; for ( j=0; j<flush_num; j++ ) { contained_page = GetTargetPage(loader,j); ((PageHeader) contained_page)->pd_checksum = pg_checksum_page((char *) contained_page, LS_TOTAL_CNT(ls) - 1 - j); } } #endif /* * Flush flush_num data blocks to the current file; afterward the * current file holds at most RELSEG_SIZE blocks. */ buffer = loader->blocks + BLCKSZ * i; total = BLCKSZ * flush_num; written = 0; while (total > 0) { int len = write(loader->datafd, buffer + written, total); if (len == -1) { /* fatal error, do not want to write blocks anymore */ ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to data file: %m"))); } written += len; total -= len; } i += flush_num; } /* * NOTICE: Be sure to reset curblk to 0 and reinitialize the recycled pages * if you continue to use the block buffer. */ }
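The write loop above treats only -1 as a failure. The following is a minimal sketch of a more defensive variant (not pg_bulkload's code; it assumes the usual <unistd.h>/<errno.h> environment) that also retries on EINTR and treats a zero-byte write as an error:

/* Hedged sketch: write an entire buffer, retrying on signals and short writes. */
static void
write_all(int fd, const char *buffer, int total)
{
    int     written = 0;

    while (total > 0)
    {
        int     len = write(fd, buffer + written, total);

        if (len < 0)
        {
            if (errno == EINTR)
                continue;           /* interrupted by a signal; just retry */
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not write to data file: %m")));
        }
        if (len == 0)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not write to data file: wrote 0 bytes")));
        written += len;
        total -= len;
    }
}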
/* * btvacuumscan --- scan the index for VACUUMing purposes * * This combines the functions of looking for leaf tuples that are deletable * according to the vacuum callback, looking for empty pages that can be * deleted, and looking for old deleted pages that can be recycled. Both * btbulkdelete and btvacuumcleanup invoke this (the latter only if no * btbulkdelete call occurred). * * The caller is responsible for initially allocating/zeroing a stats struct * and for obtaining a vacuum cycle ID if necessary. */ static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid) { MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_DECLARE; Relation rel = info->index; BTVacState vstate; BlockNumber num_pages; BlockNumber blkno; bool needLock; MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_ENTER; /* * Reset counts that will be incremented during the scan; needed in case * of multiple scans during a single VACUUM command */ stats->num_index_tuples = 0; stats->pages_deleted = 0; /* Set up info to pass down to btvacuumpage */ vstate.info = info; vstate.stats = stats; vstate.callback = callback; vstate.callback_state = callback_state; vstate.cycleid = cycleid; vstate.freePages = NULL; /* temporarily */ vstate.nFreePages = 0; vstate.maxFreePages = 0; vstate.totFreePages = 0; /* Create a temporary memory context to run _bt_pagedel in */ vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext, "_bt_pagedel", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); /* * The outer loop iterates over all index pages except the metapage, in * physical order (we hope the kernel will cooperate in providing * read-ahead for speed). It is critical that we visit all leaf pages, * including ones added after we start the scan, else we might fail to * delete some deletable tuples. Hence, we must repeatedly check the * relation length. We must acquire the relation-extension lock while * doing so to avoid a race condition: if someone else is extending the * relation, there is a window where bufmgr/smgr have created a new * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If * we manage to scan such a page here, we'll improperly assume it can be * recycled. Taking the lock synchronizes things enough to prevent a * problem: either num_pages won't include the new page, or _bt_getbuf * already has write lock on the buffer and it will be fully initialized * before we can examine it. (See also vacuumlazy.c, which has the same * issue.) Also, we need not worry if a page is added immediately after * we look; the page splitting code already has write-lock on the left * page before it adds a right page, so we must already have processed any * tuples due to be moved into such a page. * * We can skip locking for new or temp relations, however, since no one * else could be accessing them. 
*/ needLock = !RELATION_IS_LOCAL(rel); blkno = BTREE_METAPAGE + 1; for (;;) { /* Get the current relation length */ if (needLock) LockRelationForExtension(rel, ExclusiveLock); num_pages = RelationGetNumberOfBlocks(rel); if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); /* Allocate freePages after we read num_pages the first time */ if (vstate.freePages == NULL) { /* No point in remembering more than MaxFSMPages pages */ vstate.maxFreePages = MaxFSMPages; if ((BlockNumber) vstate.maxFreePages > num_pages) vstate.maxFreePages = (int) num_pages; vstate.freePages = (BlockNumber *) palloc(vstate.maxFreePages * sizeof(BlockNumber)); } /* Quit if we've scanned the whole relation */ if (blkno >= num_pages) break; /* Iterate over pages, then loop back to recheck length */ for (; blkno < num_pages; blkno++) { btvacuumpage(&vstate, blkno, blkno); } } /* * During VACUUM FULL, we truncate off any recyclable pages at the end of * the index. In a normal vacuum it'd be unsafe to do this except by * acquiring exclusive lock on the index and then rechecking all the * pages; doesn't seem worth it. */ if (info->vacuum_full && vstate.nFreePages > 0) { BlockNumber new_pages = num_pages; while (vstate.nFreePages > 0 && vstate.freePages[vstate.nFreePages - 1] == new_pages - 1) { new_pages--; stats->pages_deleted--; vstate.nFreePages--; vstate.totFreePages = vstate.nFreePages; /* can't be more */ } if (new_pages != num_pages) { /* * Okay to truncate. */ RelationTruncate(rel, new_pages, /* markPersistentAsPhysicallyTruncated */ true); /* update statistics */ stats->pages_removed += num_pages - new_pages; num_pages = new_pages; } } /* * Update the shared Free Space Map with the info we now have about free * pages in the index, discarding any old info the map may have. We do not * need to sort the page numbers; they're in order already. */ RecordIndexFreeSpace(&rel->rd_node, vstate.totFreePages, vstate.nFreePages, vstate.freePages); pfree(vstate.freePages); MemoryContextDelete(vstate.pagedelcontext); /* update statistics */ stats->num_pages = num_pages; stats->pages_free = vstate.totFreePages; MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_EXIT; }
Datum ginvacuumcleanup(PG_FUNCTION_ARGS) { MIRROREDLOCK_BUFMGR_DECLARE; IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); Relation index = info->index; bool needLock; BlockNumber npages, blkno; BlockNumber totFreePages, nFreePages, *freePages, maxFreePages; BlockNumber lastBlock = GIN_ROOT_BLKNO, lastFilledBlock = GIN_ROOT_BLKNO; /* Set up all-zero stats if ginbulkdelete wasn't called */ if (stats == NULL) stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); /* * XXX we always report the heap tuple count as the number of index * entries. This is bogus if the index is partial, but it's real hard to * tell how many distinct heap entries are referenced by a GIN index. */ stats->num_index_tuples = info->num_heap_tuples; /* * If vacuum full, we already have exclusive lock on the index. Otherwise, * need lock unless it's local to this backend. */ if (info->vacuum_full) needLock = false; else needLock = !RELATION_IS_LOCAL(index); if (needLock) LockRelationForExtension(index, ExclusiveLock); npages = RelationGetNumberOfBlocks(index); if (needLock) UnlockRelationForExtension(index, ExclusiveLock); maxFreePages = npages; if (maxFreePages > MaxFSMPages) maxFreePages = MaxFSMPages; totFreePages = nFreePages = 0; freePages = (BlockNumber *) palloc(sizeof(BlockNumber) * maxFreePages); for (blkno = GIN_ROOT_BLKNO + 1; blkno < npages; blkno++) { Buffer buffer; Page page; vacuum_delay_point(); // -------- MirroredLock ---------- MIRROREDLOCK_BUFMGR_LOCK; buffer = ReadBuffer(index, blkno); LockBuffer(buffer, GIN_SHARE); page = (Page) BufferGetPage(buffer); if (GinPageIsDeleted(page)) { if (nFreePages < maxFreePages) freePages[nFreePages++] = blkno; totFreePages++; } else lastFilledBlock = blkno; UnlockReleaseBuffer(buffer); MIRROREDLOCK_BUFMGR_UNLOCK; // -------- MirroredLock ---------- } lastBlock = npages - 1; if (info->vacuum_full && nFreePages > 0) { /* try to truncate index */ int i; for (i = 0; i < nFreePages; i++) if (freePages[i] >= lastFilledBlock) { totFreePages = nFreePages = i; break; } if (lastBlock > lastFilledBlock) RelationTruncate( index, lastFilledBlock + 1, /* markPersistentAsPhysicallyTruncated */ true); stats->pages_removed = lastBlock - lastFilledBlock; } RecordIndexFreeSpace(&index->rd_node, totFreePages, nFreePages, freePages); stats->pages_free = totFreePages; if (needLock) LockRelationForExtension(index, ExclusiveLock); stats->num_pages = RelationGetNumberOfBlocks(index); if (needLock) UnlockRelationForExtension(index, ExclusiveLock); PG_RETURN_POINTER(stats); }
/* * Return a pinned and exclusively locked buffer which can be used to insert an * index item of size itemsz (caller must ensure not to request sizes * impossible to fulfill). If oldbuf is a valid buffer, it is also locked (in * an order determined to avoid deadlocks.) * * If we find that the old page is no longer a regular index page (because * of a revmap extension), the old buffer is unlocked and we return * InvalidBuffer. * * If there's no existing page with enough free space to accommodate the new * item, the relation is extended. If this happens, *extended is set to true, * and it is the caller's responsibility to initialize the page (and WAL-log * that fact) prior to use. * * Note that in some corner cases it is possible for this routine to extend the * relation and then not return the buffer. It is this routine's * responsibility to WAL-log the page initialization and to record the page in * FSM if that happens. Such a buffer may later be reused by this routine. */ static Buffer brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, bool *extended) { BlockNumber oldblk; BlockNumber newblk; Page page; int freespace; /* callers must have checked */ Assert(itemsz <= BrinMaxItemSize); *extended = false; if (BufferIsValid(oldbuf)) oldblk = BufferGetBlockNumber(oldbuf); else oldblk = InvalidBlockNumber; /* * Loop until we find a page with sufficient free space. By the time we * return to caller out of this loop, both buffers are valid and locked; * if we have to restart here, neither buffer is locked and buf is not a * pinned buffer. */ newblk = RelationGetTargetBlock(irel); if (newblk == InvalidBlockNumber) newblk = GetPageWithFreeSpace(irel, itemsz); for (;;) { Buffer buf; bool extensionLockHeld = false; CHECK_FOR_INTERRUPTS(); if (newblk == InvalidBlockNumber) { /* * There's not enough free space in any existing index page, * according to the FSM: extend the relation to obtain a shiny new * page. */ if (!RELATION_IS_LOCAL(irel)) { LockRelationForExtension(irel, ExclusiveLock); extensionLockHeld = true; } buf = ReadBuffer(irel, P_NEW); newblk = BufferGetBlockNumber(buf); *extended = true; BRIN_elog((DEBUG2, "brin_getinsertbuffer: extending to page %u", BufferGetBlockNumber(buf))); } else if (newblk == oldblk) { /* * There's an odd corner-case here where the FSM is out-of-date, * and gave us the old page. */ buf = oldbuf; } else { buf = ReadBuffer(irel, newblk); } /* * We lock the old buffer first, if it's earlier than the new one; but * before we do, we need to check that it hasn't been turned into a * revmap page concurrently; if we detect that it happened, give up * and tell caller to start over. */ if (BufferIsValid(oldbuf) && oldblk < newblk) { LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE); if (!BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf))) { LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); /* * It is possible that the new page was obtained from * extending the relation. In that case, we must be sure to * record it in the FSM before leaving, because otherwise the * space would be lost forever. However, we cannot let an * uninitialized page get in the FSM, so we need to initialize * it first. 
*/ if (*extended) { brin_initialize_empty_new_buffer(irel, buf); /* shouldn't matter, but don't confuse caller */ *extended = false; } if (extensionLockHeld) UnlockRelationForExtension(irel, ExclusiveLock); ReleaseBuffer(buf); return InvalidBuffer; } } LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); if (extensionLockHeld) UnlockRelationForExtension(irel, ExclusiveLock); page = BufferGetPage(buf); /* * We have a new buffer to insert into. Check that the new page has * enough free space, and return it if it does; otherwise start over. * Note that we allow for the FSM to be out of date here, and in that * case we update it and move on. * * (br_page_get_freespace also checks that the FSM didn't hand us a * page that has since been repurposed for the revmap.) */ freespace = *extended ? BrinMaxItemSize : br_page_get_freespace(page); if (freespace >= itemsz) { RelationSetTargetBlock(irel, BufferGetBlockNumber(buf)); /* * Since the target block specification can get lost on cache * invalidations, make sure we update the more permanent FSM with * data about it before going away. */ if (*extended) RecordPageWithFreeSpace(irel, BufferGetBlockNumber(buf), freespace); /* * Lock the old buffer if not locked already. Note that in this * case we know for sure it's a regular page: it's later than the * new page we just got, which is not a revmap page, and revmap * pages are always consecutive. */ if (BufferIsValid(oldbuf) && oldblk > newblk) { LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE); Assert(BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf))); } return buf; } /* This page is no good. */ /* * If an entirely new page does not contain enough free space for the * new item, then surely that item is oversized. Complain loudly; but * first make sure we initialize the page and record it as free, for * next time. */ if (*extended) { brin_initialize_empty_new_buffer(irel, buf); ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %lu exceeds maximum %lu for index \"%s\"", (unsigned long) itemsz, (unsigned long) freespace, RelationGetRelationName(irel)))); return InvalidBuffer; /* keep compiler quiet */ } if (newblk != oldblk) UnlockReleaseBuffer(buf); if (BufferIsValid(oldbuf) && oldblk <= newblk) LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); newblk = RecordAndGetPageWithFreeSpace(irel, newblk, freespace, itemsz); } }
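As the header comment notes, callers must cope with an InvalidBuffer result when the old page turned into a revmap page under them. A hedged sketch of the caller-side retry (the wrapper name is hypothetical):

/* Hedged sketch: retry obtaining an insert buffer until it sticks. */
static Buffer
get_insert_buffer_retry(Relation idxrel, Buffer oldbuf, Size itemsz, bool *extended)
{
    for (;;)
    {
        Buffer  buf = brin_getinsertbuffer(idxrel, oldbuf, itemsz, extended);

        if (BufferIsValid(buf))
            return buf;             /* both buffers locked, enough free space */

        /* oldbuf was unlocked for us; simply loop and try again */
        CHECK_FOR_INTERRUPTS();
    }
}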
/* * RelationGetBufferForTuple * * Returns pinned and exclusive-locked buffer of a page in given relation * with free space >= given len. * * If otherBuffer is not InvalidBuffer, then it references a previously * pinned buffer of another page in the same relation; on return, this * buffer will also be exclusive-locked. (This case is used by heap_update; * the otherBuffer contains the tuple being updated.) * * The reason for passing otherBuffer is that if two backends are doing * concurrent heap_update operations, a deadlock could occur if they try * to lock the same two buffers in opposite orders. To ensure that this * can't happen, we impose the rule that buffers of a relation must be * locked in increasing page number order. This is most conveniently done * by having RelationGetBufferForTuple lock them both, with suitable care * for ordering. * * NOTE: it is unlikely, but not quite impossible, for otherBuffer to be the * same buffer we select for insertion of the new tuple (this could only * happen if space is freed in that page after heap_update finds there's not * enough there). In that case, the page will be pinned and locked only once. * * If use_fsm is true (the normal case), we use FSM to help us find free * space. If use_fsm is false, we always append a new empty page to the * end of the relation if the tuple won't fit on the current target page. * This can save some cycles when we know the relation is new and doesn't * contain useful amounts of free space. * * The use_fsm = false case is also useful for non-WAL-logged additions to a * relation, if the caller holds exclusive lock and is careful to invalidate * relation->rd_targblock before the first insertion --- that ensures that * all insertions will occur into newly added pages and not be intermixed * with tuples from other transactions. That way, a crash can't risk losing * any committed data of other transactions. (See heap_insert's comments * for additional constraints needed for safe usage of this behavior.) * * We always try to avoid filling existing pages further than the fillfactor. * This is OK since this routine is not consulted when updating a tuple and * keeping it on the same page, which is the scenario fillfactor is meant * to reserve space for. * * ereport(ERROR) is allowed here, so this routine *must* be called * before any (unlogged) changes are made in buffer pool. */ Buffer RelationGetBufferForTuple(Relation relation, Size len, Buffer otherBuffer, bool use_fsm) { Buffer buffer = InvalidBuffer; Page pageHeader; Size pageFreeSpace, saveFreeSpace; BlockNumber targetBlock, otherBlock; bool needLock; MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD; len = MAXALIGN(len); /* be conservative */ /* * If we're gonna fail for oversize tuple, do it right away */ if (len > MaxHeapTupleSize) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("row is too big: size %lu, maximum size %lu", (unsigned long) len, (unsigned long) MaxHeapTupleSize))); /* Compute desired extra freespace due to fillfactor option */ saveFreeSpace = RelationGetTargetPageFreeSpace(relation, HEAP_DEFAULT_FILLFACTOR); if (otherBuffer != InvalidBuffer) otherBlock = BufferGetBlockNumber(otherBuffer); else otherBlock = InvalidBlockNumber; /* just to keep compiler quiet */ /* * We first try to put the tuple on the same page we last inserted a tuple * on, as cached in the relcache entry. If that doesn't work, we ask the * shared Free Space Map to locate a suitable page. 
Since the FSM's info * might be out of date, we have to be prepared to loop around and retry * multiple times. (To insure this isn't an infinite loop, we must update * the FSM with the correct amount of free space on each page that proves * not to be suitable.) If the FSM has no record of a page with enough * free space, we give up and extend the relation. * * When use_fsm is false, we either put the tuple onto the existing target * page or extend the relation. */ if (len + saveFreeSpace <= MaxHeapTupleSize) targetBlock = relation->rd_targblock; else { /* can't fit, don't screw up FSM request tracking by trying */ targetBlock = InvalidBlockNumber; use_fsm = false; } if (targetBlock == InvalidBlockNumber && use_fsm) { /* * We have no cached target page, so ask the FSM for an initial * target. */ targetBlock = GetPageWithFreeSpace(&relation->rd_node, len + saveFreeSpace); /* * If the FSM knows nothing of the rel, try the last page before we * give up and extend. This avoids one-tuple-per-page syndrome during * bootstrapping or in a recently-started system. */ if (targetBlock == InvalidBlockNumber) { BlockNumber nblocks = RelationGetNumberOfBlocks(relation); if (nblocks > 0) targetBlock = nblocks - 1; } } while (targetBlock != InvalidBlockNumber) { /* * Read and exclusive-lock the target block, as well as the other * block if one was given, taking suitable care with lock ordering and * the possibility they are the same block. */ if (otherBuffer == InvalidBuffer) { /* easy case */ buffer = ReadBuffer(relation, targetBlock); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); } else if (otherBlock == targetBlock) { /* also easy case */ buffer = otherBuffer; LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); } else if (otherBlock < targetBlock) { /* lock other buffer first */ buffer = ReadBuffer(relation, targetBlock); LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); } else { /* lock target buffer first */ buffer = ReadBuffer(relation, targetBlock); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE); } /* * Now we can check to see if there's enough free space here. If so, * we're done. */ pageHeader = (Page) BufferGetPage(buffer); pageFreeSpace = PageGetFreeSpace(pageHeader); if (len + saveFreeSpace <= pageFreeSpace) { /* use this page as future insert target, too */ relation->rd_targblock = targetBlock; return buffer; } /* * Not enough space, so we must give up our page locks and pin (if * any) and prepare to look elsewhere. We don't care which order we * unlock the two buffers in, so this can be slightly simpler than the * code above. */ LockBuffer(buffer, BUFFER_LOCK_UNLOCK); if (otherBuffer == InvalidBuffer) ReleaseBuffer(buffer); else if (otherBlock != targetBlock) { LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buffer); } /* Without FSM, always fall out of the loop and extend */ if (!use_fsm) break; /* * Update FSM as to condition of this page, and ask for another page * to try. */ targetBlock = RecordAndGetPageWithFreeSpace(&relation->rd_node, targetBlock, pageFreeSpace, len + saveFreeSpace); } /* * Have to extend the relation. * * We have to use a lock to ensure no one else is extending the rel at the * same time, else we will both try to initialize the same new page. We * can skip locking for new or temp relations, however, since no one else * could be accessing them. 
*/ needLock = !RELATION_IS_LOCAL(relation); if (needLock) LockRelationForExtension(relation, ExclusiveLock); /* * XXX This does an lseek - rather expensive - but at the moment it is the * only way to accurately determine how many blocks are in a relation. Is * it worth keeping an accurate file length in shared memory someplace, * rather than relying on the kernel to do it for us? */ buffer = ReadBuffer(relation, P_NEW); /* * We can be certain that locking the otherBuffer first is OK, since it * must have a lower page number. */ if (otherBuffer != InvalidBuffer) LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE); /* * Now acquire lock on the new page. */ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); /* * Release the file-extension lock; it's now OK for someone else to extend * the relation some more. Note that we cannot release this lock before * we have buffer lock on the new page, or we risk a race condition * against vacuumlazy.c --- see comments therein. */ if (needLock) UnlockRelationForExtension(relation, ExclusiveLock); /* * We need to initialize the empty new page. Double-check that it really * is empty (this should never happen, but if it does we don't want to * risk wiping out valid data). */ pageHeader = (Page) BufferGetPage(buffer); if (!PageIsNew((PageHeader) pageHeader)) elog(ERROR, "page %u of relation \"%s\" should be empty but is not", BufferGetBlockNumber(buffer), RelationGetRelationName(relation)); PageInit(pageHeader, BufferGetPageSize(buffer), 0); if (len > PageGetFreeSpace(pageHeader)) { /* We should not get here given the test at the top */ elog(PANIC, "tuple is too big: size %lu", (unsigned long) len); } /* * Remember the new page as our target for future insertions. * * XXX should we enter the new page into the free space map immediately, * or just keep it for this backend's exclusive use in the short run * (until VACUUM sees it)? Seems to depend on whether you expect the * current backend to make more insertions or not, which is probably a * good bet most of the time. So for now, don't add it to FSM yet. */ relation->rd_targblock = BufferGetBlockNumber(buffer); return buffer; }
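A hedged caller sketch in the style of heap_insert, showing the contract only (the wrapper name is hypothetical; tuple placement and WAL-logging are elided):

/* Hedged sketch: obtain a target page for a new heap tuple. */
static Buffer
get_page_for_tuple(Relation relation, HeapTuple heaptup)
{
    /* no otherBuffer (plain insert), and let the FSM help us find space */
    Buffer  buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
                                               InvalidBuffer, true);

    /* the page is pinned, exclusive-locked, and has room for the tuple;
     * the caller would now place the tuple (RelationPutHeapTuple), mark the
     * buffer dirty, and WAL-log the insert before unlocking */
    return buffer;
}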
/* * VACUUM cleanup: update FSM */ Datum gistvacuumcleanup(PG_FUNCTION_ARGS) { IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); GistBulkDeleteResult *stats = (GistBulkDeleteResult *) PG_GETARG_POINTER(1); Relation rel = info->index; BlockNumber npages, blkno; BlockNumber totFreePages; BlockNumber lastBlock = GIST_ROOT_BLKNO, lastFilledBlock = GIST_ROOT_BLKNO; bool needLock; /* No-op in ANALYZE ONLY mode */ if (info->analyze_only) PG_RETURN_POINTER(stats); /* Set up all-zero stats if gistbulkdelete wasn't called */ if (stats == NULL) { stats = (GistBulkDeleteResult *) palloc0(sizeof(GistBulkDeleteResult)); /* use heap's tuple count */ stats->std.num_index_tuples = info->num_heap_tuples; stats->std.estimated_count = info->estimated_count; /* * XXX the above is wrong if index is partial. Would it be OK to just * return NULL, or is there work we must do below? */ } if (stats->needReindex) ereport(NOTICE, (errmsg("index \"%s\" needs VACUUM FULL or REINDEX to finish crash recovery", RelationGetRelationName(rel)))); /* * Need lock unless it's local to this backend. */ needLock = !RELATION_IS_LOCAL(rel); /* try to find deleted pages */ if (needLock) LockRelationForExtension(rel, ExclusiveLock); npages = RelationGetNumberOfBlocks(rel); if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); totFreePages = 0; for (blkno = GIST_ROOT_BLKNO + 1; blkno < npages; blkno++) { Buffer buffer; Page page; vacuum_delay_point(); buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); LockBuffer(buffer, GIST_SHARE); page = (Page) BufferGetPage(buffer); if (PageIsNew(page) || GistPageIsDeleted(page)) { totFreePages++; RecordFreeIndexPage(rel, blkno); } else lastFilledBlock = blkno; UnlockReleaseBuffer(buffer); } lastBlock = npages - 1; /* Finally, vacuum the FSM */ IndexFreeSpaceMapVacuum(info->index); /* return statistics */ stats->std.pages_free = totFreePages; if (needLock) LockRelationForExtension(rel, ExclusiveLock); stats->std.num_pages = RelationGetNumberOfBlocks(rel); if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); PG_RETURN_POINTER(stats); }
Datum ginvacuumcleanup(PG_FUNCTION_ARGS) { IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); Relation index = info->index; bool needLock; BlockNumber npages, blkno; BlockNumber totFreePages; GinState ginstate; GinStatsData idxStat; /* * In an autovacuum analyze, we want to clean up pending insertions. * Otherwise, an ANALYZE-only call is a no-op. */ if (info->analyze_only) { if (IsAutoVacuumWorkerProcess()) { initGinState(&ginstate, index); ginInsertCleanup(&ginstate, true, stats); } PG_RETURN_POINTER(stats); } /* * Set up all-zero stats and cleanup pending inserts if ginbulkdelete * wasn't called */ if (stats == NULL) { stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); initGinState(&ginstate, index); ginInsertCleanup(&ginstate, true, stats); } memset(&idxStat, 0, sizeof(idxStat)); /* * XXX we always report the heap tuple count as the number of index * entries. This is bogus if the index is partial, but it's real hard to * tell how many distinct heap entries are referenced by a GIN index. */ stats->num_index_tuples = info->num_heap_tuples; stats->estimated_count = info->estimated_count; /* * Need lock unless it's local to this backend. */ needLock = !RELATION_IS_LOCAL(index); if (needLock) LockRelationForExtension(index, ExclusiveLock); npages = RelationGetNumberOfBlocks(index); if (needLock) UnlockRelationForExtension(index, ExclusiveLock); totFreePages = 0; for (blkno = GIN_ROOT_BLKNO; blkno < npages; blkno++) { Buffer buffer; Page page; vacuum_delay_point(); buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); LockBuffer(buffer, GIN_SHARE); page = (Page) BufferGetPage(buffer); if (GinPageIsDeleted(page)) { Assert(blkno != GIN_ROOT_BLKNO); RecordFreeIndexPage(index, blkno); totFreePages++; } else if (GinPageIsData(page)) { idxStat.nDataPages++; } else if (!GinPageIsList(page)) { idxStat.nEntryPages++; if (GinPageIsLeaf(page)) idxStat.nEntries += PageGetMaxOffsetNumber(page); } UnlockReleaseBuffer(buffer); } /* Update the metapage with accurate page and entry counts */ idxStat.nTotalPages = npages; ginUpdateStats(info->index, &idxStat); /* Finally, vacuum the FSM */ IndexFreeSpaceMapVacuum(info->index); stats->pages_free = totFreePages; if (needLock) LockRelationForExtension(index, ExclusiveLock); stats->num_pages = RelationGetNumberOfBlocks(index); if (needLock) UnlockRelationForExtension(index, ExclusiveLock); PG_RETURN_POINTER(stats); }
/* * _bt_getbuf() -- Get a buffer by block number for read or write. * * blkno == P_NEW means to get an unallocated index page. The page * will be initialized before returning it. * * When this routine returns, the appropriate lock is set on the * requested buffer and its reference count has been incremented * (ie, the buffer is "locked and pinned"). Also, we apply * _bt_checkpage to sanity-check the page (except in P_NEW case). */ Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access) { Buffer buf; MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD; if (blkno != P_NEW) { /* Read an existing block of the relation */ buf = ReadBuffer(rel, blkno); LockBuffer(buf, access); _bt_checkpage(rel, buf); } else { bool needLock; Page page; Assert(access == BT_WRITE); /* * First see if the FSM knows of any free pages. * * We can't trust the FSM's report unreservedly; we have to check that * the page is still free. (For example, an already-free page could * have been re-used between the time the last VACUUM scanned it and * the time the VACUUM made its FSM updates.) * * In fact, it's worse than that: we can't even assume that it's safe * to take a lock on the reported page. If somebody else has a lock * on it, or even worse our own caller does, we could deadlock. (The * own-caller scenario is actually not improbable. Consider an index * on a serial or timestamp column. Nearly all splits will be at the * rightmost page, so it's entirely likely that _bt_split will call us * while holding a lock on the page most recently acquired from FSM. A * VACUUM running concurrently with the previous split could well have * placed that page back in FSM.) * * To get around that, we ask for only a conditional lock on the * reported page. If we fail, then someone else is using the page, * and we may reasonably assume it's not free. (If we happen to be * wrong, the worst consequence is the page will be lost to use till * the next VACUUM, which is no big problem.) */ for (;;) { blkno = GetFreeIndexPage(&rel->rd_node); if (blkno == InvalidBlockNumber) break; buf = ReadBuffer(rel, blkno); if (ConditionalLockBuffer(buf)) { page = BufferGetPage(buf); if (_bt_page_recyclable(page)) { /* Okay to use page. Re-initialize and return it */ _bt_pageinit(page, BufferGetPageSize(buf)); return buf; } elog(DEBUG2, "FSM returned nonrecyclable page"); _bt_relbuf(rel, buf); } else { elog(DEBUG2, "FSM returned nonlockable page"); /* couldn't get lock, so just drop pin */ ReleaseBuffer(buf); } } /* * Extend the relation by one page. * * We have to use a lock to ensure no one else is extending the rel at * the same time, else we will both try to initialize the same new * page. We can skip locking for new or temp relations, however, * since no one else could be accessing them. */ needLock = !RELATION_IS_LOCAL(rel); if (needLock) LockRelationForExtension(rel, ExclusiveLock); buf = ReadBuffer(rel, P_NEW); /* Acquire buffer lock on new page */ LockBuffer(buf, BT_WRITE); /* * Release the file-extension lock; it's now OK for someone else to * extend the relation some more. Note that we cannot release this * lock before we have buffer lock on the new page, or we risk a race * condition against btvacuumscan --- see comments therein. */ if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); /* Initialize the new page before returning it */ page = BufferGetPage(buf); Assert(PageIsNew((PageHeader) page)); _bt_pageinit(page, BufferGetPageSize(buf)); } /* ref count and lock type are correct */ return buf; }
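For illustration, a hedged sketch of the two calling modes (the wrapper name is hypothetical; BT_READ/BT_WRITE are the standard nbtree lock modes):

/* Hedged sketch: the two calling modes of _bt_getbuf(). */
static void
bt_getbuf_usage(Relation rel, BlockNumber blkno)
{
    /* Read and share-lock an existing page; _bt_checkpage() has vetted it. */
    Buffer  buf = _bt_getbuf(rel, blkno, BT_READ);

    _bt_relbuf(rel, buf);           /* unlock and unpin */

    /* Allocate a fresh page (e.g. a new right sibling during a split);
     * the returned page is already initialized and write-locked. */
    buf = _bt_getbuf(rel, P_NEW, BT_WRITE);
    _bt_relbuf(rel, buf);
}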
/* * btvacuumscan --- scan the index for VACUUMing purposes * * This combines the functions of looking for leaf tuples that are deletable * according to the vacuum callback, looking for empty pages that can be * deleted, and looking for old deleted pages that can be recycled. Both * btbulkdelete and btvacuumcleanup invoke this (the latter only if no * btbulkdelete call occurred). * * The caller is responsible for initially allocating/zeroing a stats struct * and for obtaining a vacuum cycle ID if necessary. */ static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid) { Relation rel = info->index; BTVacState vstate; BlockNumber num_pages; BlockNumber blkno; bool needLock; /* * Reset counts that will be incremented during the scan; needed in case * of multiple scans during a single VACUUM command */ stats->estimated_count = false; stats->num_index_tuples = 0; stats->pages_deleted = 0; /* Set up info to pass down to btvacuumpage */ vstate.info = info; vstate.stats = stats; vstate.callback = callback; vstate.callback_state = callback_state; vstate.cycleid = cycleid; vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */ vstate.lastUsedPage = BTREE_METAPAGE; vstate.totFreePages = 0; /* Create a temporary memory context to run _bt_pagedel in */ vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext, "_bt_pagedel", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); /* * The outer loop iterates over all index pages except the metapage, in * physical order (we hope the kernel will cooperate in providing * read-ahead for speed). It is critical that we visit all leaf pages, * including ones added after we start the scan, else we might fail to * delete some deletable tuples. Hence, we must repeatedly check the * relation length. We must acquire the relation-extension lock while * doing so to avoid a race condition: if someone else is extending the * relation, there is a window where bufmgr/smgr have created a new * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If * we manage to scan such a page here, we'll improperly assume it can be * recycled. Taking the lock synchronizes things enough to prevent a * problem: either num_pages won't include the new page, or _bt_getbuf * already has write lock on the buffer and it will be fully initialized * before we can examine it. (See also vacuumlazy.c, which has the same * issue.) Also, we need not worry if a page is added immediately after * we look; the page splitting code already has write-lock on the left * page before it adds a right page, so we must already have processed any * tuples due to be moved into such a page. * * We can skip locking for new or temp relations, however, since no one * else could be accessing them. 
*/ needLock = !RELATION_IS_LOCAL(rel); blkno = BTREE_METAPAGE + 1; for (;;) { /* Get the current relation length */ if (needLock) LockRelationForExtension(rel, ExclusiveLock); num_pages = RelationGetNumberOfBlocks(rel); if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); /* Quit if we've scanned the whole relation */ if (blkno >= num_pages) break; /* Iterate over pages, then loop back to recheck length */ for (; blkno < num_pages; blkno++) { btvacuumpage(&vstate, blkno, blkno); } } /* * InHotStandby we need to scan right up to the end of the index for * correct locking, so we may need to write a WAL record for the final * block in the index if it was not vacuumed. It's possible that VACUUMing * has actually removed zeroed pages at the end of the index so we need to * take care to issue the record for last actual block and not for the * last block that was scanned. Ignore empty indexes. */ if (XLogStandbyInfoActive() && num_pages > 1 && vstate.lastBlockVacuumed < (num_pages - 1)) { Buffer buf; /* * We can't use _bt_getbuf() here because it always applies * _bt_checkpage(), which will barf on an all-zero page. We want to * recycle all-zero pages, not fail. Also, we want to use a * nondefault buffer access strategy. */ buf = ReadBufferExtended(rel, MAIN_FORKNUM, num_pages - 1, RBM_NORMAL, info->strategy); LockBufferForCleanup(buf); _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed); _bt_relbuf(rel, buf); } MemoryContextDelete(vstate.pagedelcontext); /* update statistics */ stats->num_pages = num_pages; stats->pages_free = vstate.totFreePages; }
/*
 * btvacuumscan --- scan the index for VACUUMing purposes
 *
 * This combines the functions of looking for leaf tuples that are deletable
 * according to the vacuum callback, looking for empty pages that can be
 * deleted, and looking for old deleted pages that can be recycled.  Both
 * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
 * btbulkdelete call occurred).
 *
 * The caller is responsible for initially allocating/zeroing a stats struct
 * and for obtaining a vacuum cycle ID if necessary.
 */
static void
btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			 IndexBulkDeleteCallback callback, void *callback_state,
			 BTCycleId cycleid)
{
	Relation	rel = info->index;
	BTVacState	vstate;
	BlockNumber num_pages;
	BlockNumber blkno;
	bool		needLock;

	/*
	 * Reset counts that will be incremented during the scan; needed in case
	 * of multiple scans during a single VACUUM command
	 */
	stats->estimated_count = false;
	stats->num_index_tuples = 0;
	stats->pages_deleted = 0;

	/* Set up info to pass down to btvacuumpage */
	vstate.info = info;
	vstate.stats = stats;
	vstate.callback = callback;
	vstate.callback_state = callback_state;
	vstate.cycleid = cycleid;
	vstate.lastBlockVacuumed = BTREE_METAPAGE;	/* Initialise at first block */
	vstate.lastBlockLocked = BTREE_METAPAGE;
	vstate.totFreePages = 0;

	/* Create a temporary memory context to run _bt_pagedel in */
	vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
												  "_bt_pagedel",
												  ALLOCSET_DEFAULT_SIZES);

	/*
	 * The outer loop iterates over all index pages except the metapage, in
	 * physical order (we hope the kernel will cooperate in providing
	 * read-ahead for speed).  It is critical that we visit all leaf pages,
	 * including ones added after we start the scan, else we might fail to
	 * delete some deletable tuples.  Hence, we must repeatedly check the
	 * relation length.  We must acquire the relation-extension lock while
	 * doing so to avoid a race condition: if someone else is extending the
	 * relation, there is a window where bufmgr/smgr have created a new
	 * all-zero page but it hasn't yet been write-locked by _bt_getbuf().  If
	 * we manage to scan such a page here, we'll improperly assume it can be
	 * recycled.  Taking the lock synchronizes things enough to prevent a
	 * problem: either num_pages won't include the new page, or _bt_getbuf
	 * already has write lock on the buffer and it will be fully initialized
	 * before we can examine it.  (See also vacuumlazy.c, which has the same
	 * issue.)  Also, we need not worry if a page is added immediately after
	 * we look; the page splitting code already has write-lock on the left
	 * page before it adds a right page, so we must already have processed
	 * any tuples due to be moved into such a page.
	 *
	 * We can skip locking for new or temp relations, however, since no one
	 * else could be accessing them.
	 */
	needLock = !RELATION_IS_LOCAL(rel);

	blkno = BTREE_METAPAGE + 1;
	for (;;)
	{
		/* Get the current relation length */
		if (needLock)
			LockRelationForExtension(rel, ExclusiveLock);
		num_pages = RelationGetNumberOfBlocks(rel);
		if (needLock)
			UnlockRelationForExtension(rel, ExclusiveLock);

		/* Quit if we've scanned the whole relation */
		if (blkno >= num_pages)
			break;
		/* Iterate over pages, then loop back to recheck length */
		for (; blkno < num_pages; blkno++)
		{
			btvacuumpage(&vstate, blkno, blkno);
		}
	}

	/*
	 * Check to see if we need to issue one final WAL record for this index,
	 * which may be needed for correctness on a hot standby node when non-MVCC
	 * index scans could take place.
	 *
	 * If the WAL is replayed in hot standby, the replay process needs to get
	 * cleanup locks on all index leaf pages, just as we've been doing here.
	 * However, we won't issue any WAL records about pages that have no items
	 * to be deleted.  For pages between pages we've vacuumed, the replay code
	 * will take locks under the direction of the lastBlockVacuumed fields in
	 * the XLOG_BTREE_VACUUM WAL records.  To cover pages after the last one
	 * we vacuum, we need to issue a dummy XLOG_BTREE_VACUUM WAL record
	 * against the last leaf page in the index, if that one wasn't vacuumed.
	 */
	if (XLogStandbyInfoActive() &&
		vstate.lastBlockVacuumed < vstate.lastBlockLocked)
	{
		Buffer		buf;

		/*
		 * The page should be valid, but we can't use _bt_getbuf() because we
		 * want to use a nondefault buffer access strategy.  Since we aren't
		 * going to delete any items, getting cleanup lock again is probably
		 * overkill, but for consistency do that anyway.
		 */
		buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked,
								 RBM_NORMAL, info->strategy);
		LockBufferForCleanup(buf);
		_bt_checkpage(rel, buf);
		_bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
		_bt_relbuf(rel, buf);
	}

	MemoryContextDelete(vstate.pagedelcontext);

	/* update statistics */
	stats->num_pages = num_pages;
	stats->pages_free = vstate.totFreePages;
}
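/*
 * Caller-side sketch (simplified paraphrase of btbulkdelete, not verbatim
 * source; the real code wraps the cycle-ID bookkeeping in
 * PG_ENSURE_ERROR_CLEANUP, which is omitted here).  It shows the two caller
 * responsibilities named in the header comment above: allocating/zeroing the
 * stats struct on the first call, and obtaining a vacuum cycle ID before
 * handing control to btvacuumscan.
 */
IndexBulkDeleteResult *
btbulkdelete_sketch(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
					IndexBulkDeleteCallback callback, void *callback_state)
{
	Relation	rel = info->index;
	BTCycleId	cycleid;

	/* allocate stats if first time through, else re-use existing struct */
	if (stats == NULL)
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));

	/* establish the vacuum cycle ID to use for this scan */
	cycleid = _bt_start_vacuum(rel);

	btvacuumscan(info, stats, callback, callback_state, cycleid);

	_bt_end_vacuum(rel);

	return stats;
}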
/*
 * RelationGetBufferForTuple
 *
 *	Returns pinned and exclusive-locked buffer of a page in given relation
 *	with free space >= given len.
 *
 *	If otherBuffer is not InvalidBuffer, then it references a previously
 *	pinned buffer of another page in the same relation; on return, this
 *	buffer will also be exclusive-locked.  (This case is used by heap_update;
 *	the otherBuffer contains the tuple being updated.)
 *
 *	The reason for passing otherBuffer is that if two backends are doing
 *	concurrent heap_update operations, a deadlock could occur if they try
 *	to lock the same two buffers in opposite orders.  To ensure that this
 *	can't happen, we impose the rule that buffers of a relation must be
 *	locked in increasing page number order.  This is most conveniently done
 *	by having RelationGetBufferForTuple lock them both, with suitable care
 *	for ordering.
 *
 *	NOTE: it is unlikely, but not quite impossible, for otherBuffer to be the
 *	same buffer we select for insertion of the new tuple (this could only
 *	happen if space is freed in that page after heap_update finds there's not
 *	enough there).  In that case, the page will be pinned and locked only once.
 *
 *	For the vmbuffer and vmbuffer_other arguments, we avoid deadlock by
 *	locking them only after locking the corresponding heap page, and taking
 *	no further lwlocks while they are locked.
 *
 *	We normally use FSM to help us find free space.  However,
 *	if HEAP_INSERT_SKIP_FSM is specified, we just append a new empty page to
 *	the end of the relation if the tuple won't fit on the current target page.
 *	This can save some cycles when we know the relation is new and doesn't
 *	contain useful amounts of free space.
 *
 *	HEAP_INSERT_SKIP_FSM is also useful for non-WAL-logged additions to a
 *	relation, if the caller holds exclusive lock and is careful to invalidate
 *	relation's smgr_targblock before the first insertion --- that ensures that
 *	all insertions will occur into newly added pages and not be intermixed
 *	with tuples from other transactions.  That way, a crash can't risk losing
 *	any committed data of other transactions.  (See heap_insert's comments
 *	for additional constraints needed for safe usage of this behavior.)
 *
 *	The caller can also provide a BulkInsertState object to optimize many
 *	insertions into the same relation.  This keeps a pin on the current
 *	insertion target page (to save pin/unpin cycles) and also passes a
 *	BULKWRITE buffer selection strategy object to the buffer manager.
 *	Passing NULL for bistate selects the default behavior.
 *
 *	We always try to avoid filling existing pages further than the fillfactor.
 *	This is OK since this routine is not consulted when updating a tuple and
 *	keeping it on the same page, which is the scenario fillfactor is meant
 *	to reserve space for.
 *
 *	ereport(ERROR) is allowed here, so this routine *must* be called
 *	before any (unlogged) changes are made in buffer pool.
 */
Buffer
RelationGetBufferForTuple(Relation relation, Size len,
						  Buffer otherBuffer, int options,
						  BulkInsertState bistate,
						  Buffer *vmbuffer, Buffer *vmbuffer_other)
{
	bool		use_fsm = !(options & HEAP_INSERT_SKIP_FSM);
	Buffer		buffer = InvalidBuffer;
	Page		page;
	Size		pageFreeSpace,
				saveFreeSpace;
	BlockNumber targetBlock,
				otherBlock;
	bool		needLock;

	len = MAXALIGN(len);		/* be conservative */

	/* Bulk insert is not supported for updates, only inserts. */
	Assert(otherBuffer == InvalidBuffer || !bistate);

	/*
	 * If we're gonna fail for oversize tuple, do it right away
	 */
	if (len > MaxHeapTupleSize)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("row is too big: size %lu, maximum size %lu",
						(unsigned long) len,
						(unsigned long) MaxHeapTupleSize)));

	/* Compute desired extra freespace due to fillfactor option */
	saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
												   HEAP_DEFAULT_FILLFACTOR);

	if (otherBuffer != InvalidBuffer)
		otherBlock = BufferGetBlockNumber(otherBuffer);
	else
		otherBlock = InvalidBlockNumber;	/* just to keep compiler quiet */

	/*
	 * We first try to put the tuple on the same page we last inserted a tuple
	 * on, as cached in the BulkInsertState or relcache entry.  If that
	 * doesn't work, we ask the Free Space Map to locate a suitable page.
	 * Since the FSM's info might be out of date, we have to be prepared to
	 * loop around and retry multiple times.  (To insure this isn't an
	 * infinite loop, we must update the FSM with the correct amount of free
	 * space on each page that proves not to be suitable.)  If the FSM has no
	 * record of a page with enough free space, we give up and extend the
	 * relation.
	 *
	 * When use_fsm is false, we either put the tuple onto the existing target
	 * page or extend the relation.
	 */
	if (len + saveFreeSpace > MaxHeapTupleSize)
	{
		/* can't fit, don't bother asking FSM */
		targetBlock = InvalidBlockNumber;
		use_fsm = false;
	}
	else if (bistate && bistate->current_buf != InvalidBuffer)
		targetBlock = BufferGetBlockNumber(bistate->current_buf);
	else
		targetBlock = RelationGetTargetBlock(relation);

	if (targetBlock == InvalidBlockNumber && use_fsm)
	{
		/*
		 * We have no cached target page, so ask the FSM for an initial
		 * target.
		 */
		targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace);

		/*
		 * If the FSM knows nothing of the rel, try the last page before we
		 * give up and extend.  This avoids one-tuple-per-page syndrome during
		 * bootstrapping or in a recently-started system.
		 */
		if (targetBlock == InvalidBlockNumber)
		{
			BlockNumber nblocks = RelationGetNumberOfBlocks(relation);

			if (nblocks > 0)
				targetBlock = nblocks - 1;
		}
	}

	while (targetBlock != InvalidBlockNumber)
	{
		/*
		 * Read and exclusive-lock the target block, as well as the other
		 * block if one was given, taking suitable care with lock ordering and
		 * the possibility they are the same block.
		 *
		 * If the page-level all-visible flag is set, caller will need to
		 * clear both that and the corresponding visibility map bit.  However,
		 * by the time we return, we'll have x-locked the buffer, and we don't
		 * want to do any I/O while in that state.  So we check the bit here
		 * before taking the lock, and pin the page if it appears necessary.
		 * Checking without the lock creates a risk of getting the wrong
		 * answer, so we'll have to recheck after acquiring the lock.
		 */
		if (otherBuffer == InvalidBuffer)
		{
			/* easy case */
			buffer = ReadBufferBI(relation, targetBlock, bistate);
			if (PageIsAllVisible(BufferGetPage(buffer)))
				visibilitymap_pin(relation, targetBlock, vmbuffer);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		}
		else if (otherBlock == targetBlock)
		{
			/* also easy case */
			buffer = otherBuffer;
			if (PageIsAllVisible(BufferGetPage(buffer)))
				visibilitymap_pin(relation, targetBlock, vmbuffer);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		}
		else if (otherBlock < targetBlock)
		{
			/* lock other buffer first */
			buffer = ReadBuffer(relation, targetBlock);
			if (PageIsAllVisible(BufferGetPage(buffer)))
				visibilitymap_pin(relation, targetBlock, vmbuffer);
			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		}
		else
		{
			/* lock target buffer first */
			buffer = ReadBuffer(relation, targetBlock);
			if (PageIsAllVisible(BufferGetPage(buffer)))
				visibilitymap_pin(relation, targetBlock, vmbuffer);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
		}

		/*
		 * We now have the target page (and the other buffer, if any) pinned
		 * and locked.  However, since our initial PageIsAllVisible checks
		 * were performed before acquiring the lock, the results might now be
		 * out of date, either for the selected victim buffer, or for the
		 * other buffer passed by the caller.  In that case, we'll need to
		 * give up our locks, go get the pin(s) we failed to get earlier, and
		 * re-lock.  That's pretty painful, but hopefully shouldn't happen
		 * often.
		 *
		 * Note that there's a small possibility that we didn't pin the page
		 * above but still have the correct page pinned anyway, either because
		 * we've already made a previous pass through this loop, or because
		 * caller passed us the right page anyway.
		 *
		 * Note also that it's possible that by the time we get the pin and
		 * retake the buffer locks, the visibility map bit will have been
		 * cleared by some other backend anyway.  In that case, we'll have
		 * done a bit of extra work for no gain, but there's no real harm
		 * done.
		 */
		if (otherBuffer == InvalidBuffer || buffer <= otherBuffer)
			GetVisibilityMapPins(relation, buffer, otherBuffer,
								 targetBlock, otherBlock, vmbuffer,
								 vmbuffer_other);
		else
			GetVisibilityMapPins(relation, otherBuffer, buffer,
								 otherBlock, targetBlock, vmbuffer_other,
								 vmbuffer);

		/*
		 * Now we can check to see if there's enough free space here.  If so,
		 * we're done.
		 */
		page = BufferGetPage(buffer);
		pageFreeSpace = PageGetHeapFreeSpace(page);
		if (len + saveFreeSpace <= pageFreeSpace)
		{
			/* use this page as future insert target, too */
			RelationSetTargetBlock(relation, targetBlock);
			return buffer;
		}

		/*
		 * Not enough space, so we must give up our page locks and pin (if
		 * any) and prepare to look elsewhere.  We don't care which order we
		 * unlock the two buffers in, so this can be slightly simpler than the
		 * code above.
		 */
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
		if (otherBuffer == InvalidBuffer)
			ReleaseBuffer(buffer);
		else if (otherBlock != targetBlock)
		{
			LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
			ReleaseBuffer(buffer);
		}

		/* Without FSM, always fall out of the loop and extend */
		if (!use_fsm)
			break;

		/*
		 * Update FSM as to condition of this page, and ask for another page
		 * to try.
		 */
		targetBlock = RecordAndGetPageWithFreeSpace(relation,
													targetBlock,
													pageFreeSpace,
													len + saveFreeSpace);
	}

	/*
	 * Have to extend the relation.
	 *
	 * We have to use a lock to ensure no one else is extending the rel at the
	 * same time, else we will both try to initialize the same new page.  We
	 * can skip locking for new or temp relations, however, since no one else
	 * could be accessing them.
	 */
	needLock = !RELATION_IS_LOCAL(relation);

	if (needLock)
		LockRelationForExtension(relation, ExclusiveLock);

	/*
	 * XXX This does an lseek - rather expensive - but at the moment it is the
	 * only way to accurately determine how many blocks are in a relation.  Is
	 * it worth keeping an accurate file length in shared memory someplace,
	 * rather than relying on the kernel to do it for us?
	 */
	buffer = ReadBufferBI(relation, P_NEW, bistate);

	/*
	 * We can be certain that locking the otherBuffer first is OK, since it
	 * must have a lower page number.
	 */
	if (otherBuffer != InvalidBuffer)
		LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);

	/*
	 * Now acquire lock on the new page.
	 */
	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

	/*
	 * Release the file-extension lock; it's now OK for someone else to extend
	 * the relation some more.  Note that we cannot release this lock before
	 * we have buffer lock on the new page, or we risk a race condition
	 * against vacuumlazy.c --- see comments therein.
	 */
	if (needLock)
		UnlockRelationForExtension(relation, ExclusiveLock);

	/*
	 * We need to initialize the empty new page.  Double-check that it really
	 * is empty (this should never happen, but if it does we don't want to
	 * risk wiping out valid data).
	 */
	page = BufferGetPage(buffer);

	if (!PageIsNew(page))
		elog(ERROR, "page %u of relation \"%s\" should be empty but is not",
			 BufferGetBlockNumber(buffer),
			 RelationGetRelationName(relation));

	PageInit(page, BufferGetPageSize(buffer), 0);

	if (len > PageGetHeapFreeSpace(page))
	{
		/* We should not get here given the test at the top */
		elog(PANIC, "tuple is too big: size %lu", (unsigned long) len);
	}

	/*
	 * Remember the new page as our target for future insertions.
	 *
	 * XXX should we enter the new page into the free space map immediately,
	 * or just keep it for this backend's exclusive use in the short run
	 * (until VACUUM sees it)?  Seems to depend on whether you expect the
	 * current backend to make more insertions or not, which is probably a
	 * good bet most of the time.  So for now, don't add it to FSM yet.
	 */
	RelationSetTargetBlock(relation, BufferGetBlockNumber(buffer));

	return buffer;
}
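
/*
 * Minimal sketch (hypothetical helper, not part of PostgreSQL): the
 * deadlock-avoidance rule described in the header comment above, in
 * isolation.  Two buffers of the same relation are always locked in
 * increasing block number order, and a shared page is locked only once.
 */
static void
lock_two_buffers_in_order(Buffer a, Buffer b)
{
	BlockNumber blk_a = BufferGetBlockNumber(a);
	BlockNumber blk_b = BufferGetBlockNumber(b);

	if (a == b)
	{
		/* same page: pin/lock it only once */
		LockBuffer(a, BUFFER_LOCK_EXCLUSIVE);
	}
	else if (blk_a < blk_b)
	{
		/* lower block number first */
		LockBuffer(a, BUFFER_LOCK_EXCLUSIVE);
		LockBuffer(b, BUFFER_LOCK_EXCLUSIVE);
	}
	else
	{
		LockBuffer(b, BUFFER_LOCK_EXCLUSIVE);
		LockBuffer(a, BUFFER_LOCK_EXCLUSIVE);
	}
}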