/* * _hash_getnewbuf() -- Get a new page at the end of the index. * * This has the same API as _hash_getinitbuf, except that we are adding * a page to the index, and hence expect the page to be past the * logical EOF. (However, we have to support the case where it isn't, * since a prior try might have crashed after extending the filesystem * EOF but before updating the metapage to reflect the added page.) * * It is caller's responsibility to ensure that only one process can * extend the index at a time. In practice, this function is called * only while holding write lock on the metapage, because adding a page * is always associated with an update of metapage data. */ Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum) { BlockNumber nblocks = RelationGetNumberOfBlocksInFork(rel, forkNum); Buffer buf; if (blkno == P_NEW) elog(ERROR, "hash AM does not use P_NEW"); if (blkno > nblocks) elog(ERROR, "access to noncontiguous page in hash index \"%s\"", RelationGetRelationName(rel)); /* smgr insists we use P_NEW to extend the relation */ if (blkno == nblocks) { buf = ReadBufferExtended(rel, forkNum, P_NEW, RBM_NORMAL, NULL); if (BufferGetBlockNumber(buf) != blkno) elog(ERROR, "unexpected hash relation size: %u, should be %u", BufferGetBlockNumber(buf), blkno); LockBuffer(buf, HASH_WRITE); } else { buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO_AND_LOCK, NULL); } /* ref count and lock type are correct */ /* initialize the page */ _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf)); return buf; }
/* * get_raw_page * * Returns a copy of a page from shared buffers as a bytea, with hole * filled with zeros or simply without hole, with the length of the page * offset to be able to reconstitute the page entirely using the data * returned by this function. */ Datum get_raw_page(PG_FUNCTION_ARGS) { Oid relid = PG_GETARG_OID(0); uint32 blkno = PG_GETARG_UINT32(1); bool with_hole = PG_GETARG_BOOL(2); bytea *raw_page; Relation rel; char raw_page_data[BLCKSZ]; Buffer buf; TupleDesc tupdesc; Datum result; Datum values[2]; bool nulls[2]; HeapTuple tuple; PageHeader page_header; int16 hole_offset, hole_length; if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be superuser to use raw functions")))); rel = relation_open(relid, AccessShareLock); /* Check that this relation has storage */ if (rel->rd_rel->relkind == RELKIND_VIEW) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("cannot get raw page from view \"%s\"", RelationGetRelationName(rel)))); if (rel->rd_rel->relkind == RELKIND_COMPOSITE_TYPE) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("cannot get raw page from composite type \"%s\"", RelationGetRelationName(rel)))); if (rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("cannot get raw page from foreign table \"%s\"", RelationGetRelationName(rel)))); /* * Reject attempts to read non-local temporary relations; we would be * likely to get wrong data since we have no visibility into the owning * session's local buffers. */ if (RELATION_IS_OTHER_TEMP(rel)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot access temporary tables of other sessions"))); if (blkno >= RelationGetNumberOfBlocksInFork(rel, MAIN_FORKNUM)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("block number %u is out of range for relation \"%s\"", blkno, RelationGetRelationName(rel)))); /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); /* Take a copy of the page to work on */ buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, NULL); LockBuffer(buf, BUFFER_LOCK_SHARE); memcpy(raw_page_data, BufferGetPage(buf), BLCKSZ); LockBuffer(buf, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buf); relation_close(rel, AccessShareLock); page_header = (PageHeader) raw_page_data; hole_length = page_header->pd_upper - page_header->pd_lower; hole_offset = page_header->pd_lower; /* * If hole is wanted in the page returned, fill it with zeros. * If not, copy to the return buffer the page without the hole. */ if (with_hole) { raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); memcpy(VARDATA(raw_page), raw_page_data, BLCKSZ); MemSet(raw_page_data + hole_offset, 0, hole_length); } else { raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ - hole_length); SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ - hole_length); memcpy(VARDATA(raw_page), raw_page_data, hole_offset); memcpy(VARDATA(raw_page) + hole_offset, raw_page_data + hole_offset + hole_length, BLCKSZ - (hole_offset + hole_length)); } /* Build and return the tuple. */ values[0] = PointerGetDatum(raw_page); if (with_hole) values[1] = UInt16GetDatum(0); else values[1] = UInt16GetDatum(hole_offset); memset(nulls, 0, sizeof(nulls)); tuple = heap_form_tuple(tupdesc, values, nulls); result = HeapTupleGetDatum(tuple); PG_RETURN_DATUM(result); }
/* * pg_prewarm(regclass, mode text, fork text, * first_block int8, last_block int8) * * The first argument is the relation to be prewarmed; the second controls * how prewarming is done; legal options are 'prefetch', 'read', and 'buffer'. * The third is the name of the relation fork to be prewarmed. The fourth * and fifth arguments specify the first and last block to be prewarmed. * If the fourth argument is NULL, it will be taken as 0; if the fifth argument * is NULL, it will be taken as the number of blocks in the relation. The * return value is the number of blocks successfully prewarmed. */ Datum pg_prewarm(PG_FUNCTION_ARGS) { Oid relOid; text *forkName; text *type; int64 first_block; int64 last_block; int64 nblocks; int64 blocks_done = 0; int64 block; Relation rel; ForkNumber forkNumber; char *forkString; char *ttype; PrewarmType ptype; AclResult aclresult; /* Basic sanity checking. */ if (PG_ARGISNULL(0)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("relation cannot be null"))); relOid = PG_GETARG_OID(0); if (PG_ARGISNULL(1)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), (errmsg("prewarm type cannot be null")))); type = PG_GETARG_TEXT_P(1); ttype = text_to_cstring(type); if (strcmp(ttype, "prefetch") == 0) ptype = PREWARM_PREFETCH; else if (strcmp(ttype, "read") == 0) ptype = PREWARM_READ; else if (strcmp(ttype, "buffer") == 0) ptype = PREWARM_BUFFER; else { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid prewarm type"), errhint("Valid prewarm types are \"prefetch\", \"read\", and \"buffer\"."))); PG_RETURN_INT64(0); /* Placate compiler. */ } if (PG_ARGISNULL(2)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), (errmsg("relation fork cannot be null")))); forkName = PG_GETARG_TEXT_P(2); forkString = text_to_cstring(forkName); forkNumber = forkname_to_number(forkString); /* Open relation and check privileges. */ rel = relation_open(relOid, AccessShareLock); aclresult = pg_class_aclcheck(relOid, GetUserId(), ACL_SELECT); if (aclresult != ACLCHECK_OK) aclcheck_error(aclresult, ACL_KIND_CLASS, get_rel_name(relOid)); /* Check that the fork exists. */ RelationOpenSmgr(rel); if (!smgrexists(rel->rd_smgr, forkNumber)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("fork \"%s\" does not exist for this relation", forkString))); /* Validate block numbers, or handle nulls. */ nblocks = RelationGetNumberOfBlocksInFork(rel, forkNumber); if (PG_ARGISNULL(3)) first_block = 0; else { first_block = PG_GETARG_INT64(3); if (first_block < 0 || first_block >= nblocks) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("starting block number must be between 0 and " INT64_FORMAT, nblocks - 1))); } if (PG_ARGISNULL(4)) last_block = nblocks - 1; else { last_block = PG_GETARG_INT64(4); if (last_block < 0 || last_block >= nblocks) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("ending block number must be between 0 and " INT64_FORMAT, nblocks - 1))); } /* Now we're ready to do the real work. */ if (ptype == PREWARM_PREFETCH) { #ifdef USE_PREFETCH /* * In prefetch mode, we just hint the OS to read the blocks, but we * don't know whether it really does it, and we don't wait for it to * finish. * * It would probably be better to pass our prefetch requests in chunks * of a megabyte or maybe even a whole segment at a time, but there's * no practical way to do that at present without a gross modularity * violation, so we just do this. */ for (block = first_block; block <= last_block; ++block) { CHECK_FOR_INTERRUPTS(); PrefetchBuffer(rel, forkNumber, block); ++blocks_done; } #else ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("prefetch is not supported by this build"))); #endif } else if (ptype == PREWARM_READ) { /* * In read mode, we actually read the blocks, but not into shared * buffers. This is more portable than prefetch mode (it works * everywhere) and is synchronous. */ for (block = first_block; block <= last_block; ++block) { CHECK_FOR_INTERRUPTS(); smgrread(rel->rd_smgr, forkNumber, block, blockbuffer); ++blocks_done; } } else if (ptype == PREWARM_BUFFER) { /* * In buffer mode, we actually pull the data into shared_buffers. */ for (block = first_block; block <= last_block; ++block) { Buffer buf; CHECK_FOR_INTERRUPTS(); buf = ReadBufferExtended(rel, forkNumber, block, RBM_NORMAL, NULL); ReleaseBuffer(buf); ++blocks_done; } } /* Close relation, release lock. */ relation_close(rel, AccessShareLock); PG_RETURN_INT64(blocks_done); }
/* * _hash_metapinit() -- Initialize the metadata page of a hash index, * the initial buckets, and the initial bitmap page. * * The initial number of buckets is dependent on num_tuples, an estimate * of the number of tuples to be loaded into the index initially. The * chosen number of buckets is returned. * * We are fairly cavalier about locking here, since we know that no one else * could be accessing this index. In particular the rule about not holding * multiple buffer locks is ignored. */ uint32 _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum) { HashMetaPage metap; HashPageOpaque pageopaque; Buffer metabuf; Buffer buf; Page pg; int32 data_width; int32 item_width; int32 ffactor; double dnumbuckets; uint32 num_buckets; uint32 log2_num_buckets; uint32 i; /* safety check */ if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0) elog(ERROR, "cannot initialize non-empty hash index \"%s\"", RelationGetRelationName(rel)); /* * Determine the target fill factor (in tuples per bucket) for this index. * The idea is to make the fill factor correspond to pages about as full * as the user-settable fillfactor parameter says. We can compute it * exactly since the index datatype (i.e. uint32 hash key) is fixed-width. */ data_width = sizeof(uint32); item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) + sizeof(ItemIdData); /* include the line pointer */ ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width; /* keep to a sane range */ if (ffactor < 10) ffactor = 10; /* * Choose the number of initial bucket pages to match the fill factor * given the estimated number of tuples. We round up the result to the * next power of 2, however, and always force at least 2 bucket pages. The * upper limit is determined by considerations explained in * _hash_expandtable(). */ dnumbuckets = num_tuples / ffactor; if (dnumbuckets <= 2.0) num_buckets = 2; else if (dnumbuckets >= (double) 0x40000000) num_buckets = 0x40000000; else num_buckets = ((uint32) 1) << _hash_log2((uint32) dnumbuckets); log2_num_buckets = _hash_log2(num_buckets); Assert(num_buckets == (((uint32) 1) << log2_num_buckets)); Assert(log2_num_buckets < HASH_MAX_SPLITPOINTS); /* * We initialize the metapage, the first N bucket pages, and the first * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend() * calls to occur. This ensures that the smgr level has the right idea of * the physical index length. */ metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum); pg = BufferGetPage(metabuf); pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg); pageopaque->hasho_prevblkno = InvalidBlockNumber; pageopaque->hasho_nextblkno = InvalidBlockNumber; pageopaque->hasho_bucket = -1; pageopaque->hasho_flag = LH_META_PAGE; pageopaque->hasho_page_id = HASHO_PAGE_ID; metap = HashPageGetMeta(pg); metap->hashm_magic = HASH_MAGIC; metap->hashm_version = HASH_VERSION; metap->hashm_ntuples = 0; metap->hashm_nmaps = 0; metap->hashm_ffactor = ffactor; metap->hashm_bsize = HashGetMaxBitmapSize(pg); /* find largest bitmap array size that will fit in page size */ for (i = _hash_log2(metap->hashm_bsize); i > 0; --i) { if ((1 << i) <= metap->hashm_bsize) break; } Assert(i > 0); metap->hashm_bmsize = 1 << i; metap->hashm_bmshift = i + BYTE_TO_BIT; Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1)); /* * Label the index with its primary hash support function's OID. This is * pretty useless for normal operation (in fact, hashm_procid is not used * anywhere), but it might be handy for forensic purposes so we keep it. */ metap->hashm_procid = index_getprocid(rel, 1, HASHPROC); /* * We initialize the index with N buckets, 0 .. N-1, occupying physical * blocks 1 to N. The first freespace bitmap page is in block N+1. Since * N is a power of 2, we can set the masks this way: */ metap->hashm_maxbucket = metap->hashm_lowmask = num_buckets - 1; metap->hashm_highmask = (num_buckets << 1) - 1; MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares)); MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp)); /* Set up mapping for one spare page after the initial splitpoints */ metap->hashm_spares[log2_num_buckets] = 1; metap->hashm_ovflpoint = log2_num_buckets; metap->hashm_firstfree = 0; /* * Release buffer lock on the metapage while we initialize buckets. * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS * won't accomplish anything. It's a bad idea to hold buffer locks for * long intervals in any case, since that can block the bgwriter. */ _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); /* * Initialize the first N buckets */ for (i = 0; i < num_buckets; i++) { /* Allow interrupts, in case N is huge */ CHECK_FOR_INTERRUPTS(); buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), forkNum); pg = BufferGetPage(buf); pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg); pageopaque->hasho_prevblkno = InvalidBlockNumber; pageopaque->hasho_nextblkno = InvalidBlockNumber; pageopaque->hasho_bucket = i; pageopaque->hasho_flag = LH_BUCKET_PAGE; pageopaque->hasho_page_id = HASHO_PAGE_ID; _hash_wrtbuf(rel, buf); } /* Now reacquire buffer lock on metapage */ _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); /* * Initialize first bitmap page */ _hash_initbitmap(rel, metap, num_buckets + 1, forkNum); /* all done */ _hash_wrtbuf(rel, metabuf); return num_buckets; }
/* * _hash_init() -- Initialize the metadata page of a hash index, * the initial buckets, and the initial bitmap page. * * The initial number of buckets is dependent on num_tuples, an estimate * of the number of tuples to be loaded into the index initially. The * chosen number of buckets is returned. * * We are fairly cavalier about locking here, since we know that no one else * could be accessing this index. In particular the rule about not holding * multiple buffer locks is ignored. */ uint32 _hash_init(Relation rel, double num_tuples, ForkNumber forkNum) { Buffer metabuf; Buffer buf; Buffer bitmapbuf; Page pg; HashMetaPage metap; RegProcedure procid; int32 data_width; int32 item_width; int32 ffactor; uint32 num_buckets; uint32 i; bool use_wal; /* safety check */ if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0) elog(ERROR, "cannot initialize non-empty hash index \"%s\"", RelationGetRelationName(rel)); /* * WAL log creation of pages if the relation is persistent, or this is the * init fork. Init forks for unlogged relations always need to be WAL * logged. */ use_wal = RelationNeedsWAL(rel) || forkNum == INIT_FORKNUM; /* * Determine the target fill factor (in tuples per bucket) for this index. * The idea is to make the fill factor correspond to pages about as full * as the user-settable fillfactor parameter says. We can compute it * exactly since the index datatype (i.e. uint32 hash key) is fixed-width. */ data_width = sizeof(uint32); item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) + sizeof(ItemIdData); /* include the line pointer */ ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width; /* keep to a sane range */ if (ffactor < 10) ffactor = 10; procid = index_getprocid(rel, 1, HASHSTANDARD_PROC); /* * We initialize the metapage, the first N bucket pages, and the first * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend() * calls to occur. This ensures that the smgr level has the right idea of * the physical index length. * * Critical section not required, because on error the creation of the * whole relation will be rolled back. */ metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum); _hash_init_metabuffer(metabuf, num_tuples, procid, ffactor, false); MarkBufferDirty(metabuf); pg = BufferGetPage(metabuf); metap = HashPageGetMeta(pg); /* XLOG stuff */ if (use_wal) { xl_hash_init_meta_page xlrec; XLogRecPtr recptr; xlrec.num_tuples = num_tuples; xlrec.procid = metap->hashm_procid; xlrec.ffactor = metap->hashm_ffactor; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashInitMetaPage); XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_META_PAGE); PageSetLSN(BufferGetPage(metabuf), recptr); } num_buckets = metap->hashm_maxbucket + 1; /* * Release buffer lock on the metapage while we initialize buckets. * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS * won't accomplish anything. It's a bad idea to hold buffer locks for * long intervals in any case, since that can block the bgwriter. */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); /* * Initialize and WAL Log the first N buckets */ for (i = 0; i < num_buckets; i++) { BlockNumber blkno; /* Allow interrupts, in case N is huge */ CHECK_FOR_INTERRUPTS(); blkno = BUCKET_TO_BLKNO(metap, i); buf = _hash_getnewbuf(rel, blkno, forkNum); _hash_initbuf(buf, metap->hashm_maxbucket, i, LH_BUCKET_PAGE, false); MarkBufferDirty(buf); if (use_wal) log_newpage(&rel->rd_node, forkNum, blkno, BufferGetPage(buf), true); _hash_relbuf(rel, buf); } /* Now reacquire buffer lock on metapage */ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); /* * Initialize bitmap page */ bitmapbuf = _hash_getnewbuf(rel, num_buckets + 1, forkNum); _hash_initbitmapbuffer(bitmapbuf, metap->hashm_bmsize, false); MarkBufferDirty(bitmapbuf); /* add the new bitmap page to the metapage's list of bitmaps */ /* metapage already has a write lock */ if (metap->hashm_nmaps >= HASH_MAX_BITMAPS) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("out of overflow pages in hash index \"%s\"", RelationGetRelationName(rel)))); metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1; metap->hashm_nmaps++; MarkBufferDirty(metabuf); /* XLOG stuff */ if (use_wal) { xl_hash_init_bitmap_page xlrec; XLogRecPtr recptr; xlrec.bmsize = metap->hashm_bmsize; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashInitBitmapPage); XLogRegisterBuffer(0, bitmapbuf, REGBUF_WILL_INIT); /* * This is safe only because nobody else can be modifying the index at * this stage; it's only visible to the transaction that is creating * it. */ XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_BITMAP_PAGE); PageSetLSN(BufferGetPage(bitmapbuf), recptr); PageSetLSN(BufferGetPage(metabuf), recptr); } /* all done */ _hash_relbuf(rel, bitmapbuf); _hash_relbuf(rel, metabuf); return num_buckets; }
static void ReadBlocks(int filenum) { FILE *file; char record_type; char *dbname; Oid record_filenode; ForkNumber record_forknum; BlockNumber record_blocknum; BlockNumber record_range; int log_level = DEBUG3; Oid relOid = InvalidOid; Relation rel = NULL; bool skip_relation = false; bool skip_fork = false; bool skip_block = false; BlockNumber nblocks = 0; BlockNumber blocks_restored = 0; const char *filepath; /* * If this condition changes, then this code, and the code in the writer * will need to be changed; especially the format specifiers in log and * error messages. */ StaticAssertStmt(MaxBlockNumber == 0xFFFFFFFE, "Code may need review."); filepath = getSavefileName(filenum); file = fileOpen(filepath, PG_BINARY_R); dbname = readDBName(file, filepath); /* * When restoring global objects, the dbname is zero-length string, and non- * zero length otherwise. And filenum is never expected to be smaller than 1. */ Assert(filenum >= 1); Assert(filenum == 1 ? strlen(dbname) == 0 : strlen(dbname) > 0); /* To restore the global objects, use default database */ BackgroundWorkerInitializeConnection(filenum == 1 ? guc_default_database : dbname, NULL); SetCurrentStatementStartTimestamp(); StartTransactionCommand(); SPI_connect(); PushActiveSnapshot(GetTransactionSnapshot()); pgstat_report_activity(STATE_RUNNING, "restoring buffers"); /* * Note that in case of a read error, we will leak relcache entry that we may * currently have open. In case of EOF, we close the relation after the loop. */ while (fileRead(&record_type, 1, file, true, filepath)) { /* * If we want to process the signals, this seems to be the best place * to do it. Generally the backends refrain from processing config file * while in transaction, but that's more for the fear of allowing GUC * changes to affect expression evaluation, causing different results * for the same expression in a transaction. Since this worker is not * processing any queries, it is okay to process the config file here. * * Even though it's okay to process SIGHUP here, doing so doesn't add * any value. The only reason we might want to process config file here * would be to allow the user to interrupt the BlockReader's operation * by changing this extenstion's GUC parameter. But the user can do that * anyway, using SIGTERM or pg_terminate_backend(). */ /* Stop processing the save-file if the Postmaster wants us to die. */ if (got_sigterm) break; ereport(log_level, (errmsg("record type %x - %c", record_type, record_type))); switch (record_type) { case 'r': { /* Close the previous relation, if any. */ if (rel) { relation_close(rel, AccessShareLock); rel = NULL; } record_forknum = InvalidForkNumber; record_blocknum = InvalidBlockNumber; nblocks = 0; fileRead(&record_filenode, sizeof(Oid), file, false, filepath); relOid = GetRelOid(record_filenode); ereport(log_level, (errmsg("processing filenode %u, relation %u", record_filenode, relOid))); /* * If the relation has been rewritten/dropped since we saved it, * just skip it and process the next relation. */ if (relOid == InvalidOid) skip_relation = true; else { skip_relation = false; /* Open the relation */ rel = relation_open(relOid, AccessShareLock); RelationOpenSmgr(rel); } } break; case 'f': { record_blocknum = InvalidBlockNumber; nblocks = 0; fileRead(&record_forknum, sizeof(ForkNumber), file, false, filepath); if (skip_relation) continue; if (rel == NULL) ereport(ERROR, (errmsg("found a fork record without a preceeding relation record"))); ereport(log_level, (errmsg("processing fork %d", record_forknum))); if (!smgrexists(rel->rd_smgr, record_forknum)) skip_fork = true; else { skip_fork = false; nblocks = RelationGetNumberOfBlocksInFork(rel, record_forknum); } } break; case 'b': { if (record_forknum == InvalidForkNumber) ereport(ERROR, (errmsg("found a block record without a preceeding fork record"))); fileRead(&record_blocknum, sizeof(BlockNumber), file, false, filepath); if (skip_relation || skip_fork) continue; /* * Don't try to read past the file; the file may have been shrunk * by a vaccum/truncate operation. */ if (record_blocknum >= nblocks) { ereport(log_level, (errmsg("reader %d skipping block filenode %u forknum %d blocknum %u", filenum, record_filenode, record_forknum, record_blocknum))); skip_block = true; continue; } else { Buffer buf; skip_block = false; ereport(log_level, (errmsg("reader %d reading block filenode %u forknum %d blocknum %u", filenum, record_filenode, record_forknum, record_blocknum))); buf = ReadBufferExtended(rel, record_forknum, record_blocknum, RBM_NORMAL, NULL); ReleaseBuffer(buf); ++blocks_restored; } } break; case 'N': { BlockNumber block; Assert(record_blocknum != InvalidBlockNumber); if (record_blocknum == InvalidBlockNumber) ereport(ERROR, (errmsg("found a block range record without a preceeding block record"))); fileRead(&record_range, sizeof(int), file, false, filepath); if (skip_relation || skip_fork || skip_block) continue; ereport(log_level, (errmsg("reader %d reading range filenode %u forknum %d blocknum %u range %u", filenum, record_filenode, record_forknum, record_blocknum, record_range))); for (block = record_blocknum + 1; block <= (record_blocknum + record_range); ++block) { Buffer buf; /* * Don't try to read past the file; the file may have been * shrunk by a vaccum operation. */ if (block >= nblocks) { ereport(log_level, (errmsg("reader %d skipping block range filenode %u forknum %d start %u end %u", filenum, record_filenode, record_forknum, block, record_blocknum + record_range))); break; } buf = ReadBufferExtended(rel, record_forknum, block, RBM_NORMAL, NULL); ReleaseBuffer(buf); ++blocks_restored; } } break; default: { ereport(ERROR, (errmsg("found unexpected save-file marker %x - %c)", record_type, record_type))); Assert(false); } break; } } if (rel) relation_close(rel, AccessShareLock); ereport(LOG, (errmsg("Block Reader %d: restored %u blocks", filenum, blocks_restored))); SPI_finish(); PopActiveSnapshot(); CommitTransactionCommand(); pgstat_report_activity(STATE_IDLE, NULL); fileClose(file, filepath); /* Remove the save-file */ if (remove(filepath) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("error removing file \"%s\" : %m", filepath))); }