/* * SetMatViewToPopulated * Indicate that the materialized view has been populated by its query. * * NOTE: The heap starts out in a state that doesn't look scannable, and can * only transition from there to scannable at the time a new heap is created. * * NOTE: caller must be holding an appropriate lock on the relation. */ void SetMatViewToPopulated(Relation relation) { Page page; Assert(relation->rd_rel->relkind == RELKIND_MATVIEW); Assert(relation->rd_ispopulated == false); page = (Page) palloc(BLCKSZ); PageInit(page, BLCKSZ, 0); if (RelationNeedsWAL(relation)) log_newpage(&(relation->rd_node), MAIN_FORKNUM, 0, page); RelationOpenSmgr(relation); PageSetChecksumInplace(page, 0); smgrextend(relation->rd_smgr, MAIN_FORKNUM, 0, (char *) page, true); pfree(page); smgrimmedsync(relation->rd_smgr, MAIN_FORKNUM); RelationCacheInvalidateEntry(relation->rd_id); }
/*
 * Remove the visibility map fork for a relation. If there turn out to be
 * any bugs in the visibility map code that require rebuilding the VM, this
 * provides users with a way to do it that is cleaner than shutting down the
 * server and removing files by hand.
 *
 * This is a cut-down version of RelationTruncate.
 */
Datum
pg_truncate_visibility_map(PG_FUNCTION_ARGS)
{
    Oid         relid = PG_GETARG_OID(0);
    Relation    rel;

    rel = relation_open(relid, AccessExclusiveLock);

    if (rel->rd_rel->relkind != RELKIND_RELATION &&
        rel->rd_rel->relkind != RELKIND_MATVIEW &&
        rel->rd_rel->relkind != RELKIND_TOASTVALUE)
        ereport(ERROR,
                (errcode(ERRCODE_WRONG_OBJECT_TYPE),
                 errmsg("\"%s\" is not a table, materialized view, or TOAST table",
                        RelationGetRelationName(rel))));

    RelationOpenSmgr(rel);
    rel->rd_smgr->smgr_vm_nblocks = InvalidBlockNumber;

    visibilitymap_truncate(rel, 0);

    if (RelationNeedsWAL(rel))
    {
        xl_smgr_truncate xlrec;

        xlrec.blkno = 0;
        xlrec.rnode = rel->rd_node;
        xlrec.flags = SMGR_TRUNCATE_VM;

        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, sizeof(xlrec));

        XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
    }

    /*
     * Release the lock right away, not at commit time.
     *
     * It would be a problem to release the lock prior to commit if this
     * truncate operation sends any transactional invalidation messages. Other
     * backends would potentially be able to lock the relation without
     * processing them in the window of time between when we release the lock
     * here and when we send the messages at our eventual commit. However,
     * we're currently only sending a non-transactional smgr invalidation,
     * which will have been posted to shared memory immediately from within
     * visibilitymap_truncate. Therefore, there should be no race here.
     *
     * The reason why it's desirable to release the lock early here is because
     * of the possibility that someone will need to use this to blow away many
     * visibility map forks at once. If we can't release the lock until
     * commit time, the transaction doing this will accumulate
     * AccessExclusiveLocks on all of those relations at the same time, which
     * is undesirable. However, if this turns out to be unsafe we may have no
     * choice...
     */
    relation_close(rel, AccessExclusiveLock);

    /* Nothing to return. */
    PG_RETURN_VOID();
}
/* * Ensure that the visibility map fork is at least vm_nblocks long, extending * it if necessary with zeroed pages. */ static void vm_extend(Relation rel, BlockNumber vm_nblocks) { BlockNumber vm_nblocks_now; Page pg; pg = (Page) palloc(BLCKSZ); PageInit(pg, BLCKSZ, 0); /* * We use the relation extension lock to lock out other backends trying to * extend the visibility map at the same time. It also locks out extension * of the main fork, unnecessarily, but extending the visibility map * happens seldom enough that it doesn't seem worthwhile to have a * separate lock tag type for it. * * Note that another backend might have extended or created the relation * by the time we get the lock. */ LockRelationForExtension(rel, ExclusiveLock); /* Might have to re-open if a cache flush happened */ RelationOpenSmgr(rel); /* * Create the file first if it doesn't exist. If smgr_vm_nblocks is * positive then it must exist, no need for an smgrexists call. */ if ((rel->rd_smgr->smgr_vm_nblocks == 0 || rel->rd_smgr->smgr_vm_nblocks == InvalidBlockNumber) && !smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM)) smgrcreate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, false); vm_nblocks_now = smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM); /* Now extend the file */ while (vm_nblocks_now < vm_nblocks) { smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now, (char *) pg, false); vm_nblocks_now++; } /* * Send a shared-inval message to force other backends to close any smgr * references they may have for this rel, which we are about to change. * This is a useful optimization because it means that backends don't have * to keep checking for creation or extension of the file, which happens * infrequently. */ CacheInvalidateSmgr(rel->rd_smgr->smgr_rnode); /* Update local cache with the up-to-date size */ rel->rd_smgr->smgr_vm_nblocks = vm_nblocks_now; UnlockRelationForExtension(rel, ExclusiveLock); pfree(pg); }
/*
 * FreeSpaceMapTruncateRel - adjust for truncation of a relation.
 *
 * The caller must hold AccessExclusiveLock on the relation, to ensure that
 * other backends receive the smgr invalidation event that this function sends
 * before they access the FSM again.
 *
 * nblocks is the new size of the heap.
 */
void
FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks)
{
    BlockNumber new_nfsmblocks;
    FSMAddress  first_removed_address;
    uint16      first_removed_slot;
    Buffer      buf;

    RelationOpenSmgr(rel);

    /*
     * If no FSM has been created yet for this relation, there's nothing to
     * truncate.
     */
    if (!smgrexists(rel->rd_smgr, FSM_FORKNUM))
        return;

    /* Get the location in the FSM of the first removed heap block */
    first_removed_address = fsm_get_location(nblocks, &first_removed_slot);

    /*
     * Zero out the tail of the last remaining FSM page. If the slot
     * representing the first removed heap block is at a page boundary, as the
     * first slot on the FSM page that first_removed_address points to, we can
     * just truncate that page altogether.
     */
    if (first_removed_slot > 0)
    {
        buf = fsm_readbuf(rel, first_removed_address, false);
        if (!BufferIsValid(buf))
            return;             /* nothing to do; the FSM was already smaller */
        LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
        fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
        MarkBufferDirtyHint(buf, false);
        UnlockReleaseBuffer(buf);

        new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
    }
    else
    {
        new_nfsmblocks = fsm_logical_to_physical(first_removed_address);
        if (smgrnblocks(rel->rd_smgr, FSM_FORKNUM) <= new_nfsmblocks)
            return;             /* nothing to do; the FSM was already smaller */
    }

    /* Truncate the unused FSM pages, and send smgr inval message */
    smgrtruncate(rel->rd_smgr, FSM_FORKNUM, new_nfsmblocks);

    /*
     * We might as well update the local smgr_fsm_nblocks setting.
     * smgrtruncate sent an smgr cache inval message, which will cause other
     * backends to invalidate their copy of smgr_fsm_nblocks, and this one too
     * at the next command boundary. But this ensures it isn't outright wrong
     * until then.
     */
    if (rel->rd_smgr)
        rel->rd_smgr->smgr_fsm_nblocks = new_nfsmblocks;
}
/* * emit a completed btree page, and release the working storage. */ static void _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno) { /* Ensure rd_smgr is open (could have been closed by relcache flush!) */ RelationOpenSmgr(wstate->index); /* XLOG stuff */ if (wstate->btws_use_wal) { /* We use the heap NEWPAGE record type for this */ log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page); } else { /* Leave the page LSN zero if not WAL-logged, but set TLI anyway */ PageSetTLI(page, ThisTimeLineID); } /* * If we have to write pages nonsequentially, fill in the space with * zeroes until we come back and overwrite. This is not logically * necessary on standard Unix filesystems (unwritten space will read as * zeroes anyway), but it should help to avoid fragmentation. The dummy * pages aren't WAL-logged though. */ while (blkno > wstate->btws_pages_written) { if (!wstate->btws_zeropage) wstate->btws_zeropage = (Page) palloc0(BLCKSZ); smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, wstate->btws_pages_written++, (char *) wstate->btws_zeropage, true); } /* * Now write the page. We say isTemp = true even if it's not a temp * index, because there's no need for smgr to schedule an fsync for this * write; we'll do it ourselves before ending the build. */ if (blkno == wstate->btws_pages_written) { /* extending the file... */ smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, blkno, (char *) page, true); wstate->btws_pages_written++; } else { /* overwriting a block we zero-filled before */ smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno, (char *) page, true); } pfree(page); }
/* * End a rewrite. * * state and any other resources are freed. */ void end_heap_rewrite(RewriteState state) { HASH_SEQ_STATUS seq_status; UnresolvedTup unresolved; /* * Write any remaining tuples in the UnresolvedTups table. If we have any * left, they should in fact be dead, but let's err on the safe side. */ hash_seq_init(&seq_status, state->rs_unresolved_tups); while ((unresolved = hash_seq_search(&seq_status)) != NULL) { ItemPointerSetInvalid(&unresolved->tuple->t_data->t_ctid); raw_heap_insert(state, unresolved->tuple); } /* Write the last page, if any */ if (state->rs_buffer_valid) { if (state->rs_use_wal) log_newpage(&state->rs_new_rel->rd_node, MAIN_FORKNUM, state->rs_blockno, state->rs_buffer, true); RelationOpenSmgr(state->rs_new_rel); PageSetChecksumInplace(state->rs_buffer, state->rs_blockno); smgrextend(state->rs_new_rel->rd_smgr, MAIN_FORKNUM, state->rs_blockno, (char *) state->rs_buffer, true); } /* * If the rel is WAL-logged, must fsync before commit. We use heap_sync * to ensure that the toast table gets fsync'd too. * * It's obvious that we must do this when not WAL-logging. It's less * obvious that we have to do it even if we did WAL-log the pages. The * reason is the same as in tablecmds.c's copy_relation_data(): we're * writing data that's not in shared buffers, and so a CHECKPOINT * occurring during the rewriteheap operation won't have fsync'd data we * wrote before the checkpoint. */ if (RelationNeedsWAL(state->rs_new_rel)) heap_sync(state->rs_new_rel); /* Deleting the context frees everything */ MemoryContextDelete(state->rs_cxt); }
/* * Ensure that the FSM fork is at least fsm_nblocks long, extending * it if necessary with empty pages. And by empty, I mean pages filled * with zeros, meaning there's no free space. */ static void fsm_extend(Relation rel, BlockNumber fsm_nblocks) { BlockNumber fsm_nblocks_now; Page pg; pg = (Page) palloc(BLCKSZ); PageInit(pg, BLCKSZ, 0); /* * We use the relation extension lock to lock out other backends trying to * extend the FSM at the same time. It also locks out extension of the * main fork, unnecessarily, but extending the FSM happens seldom enough * that it doesn't seem worthwhile to have a separate lock tag type for * it. * * Note that another backend might have extended or created the relation * by the time we get the lock. */ LockRelationForExtension(rel, ExclusiveLock); /* Might have to re-open if a cache flush happened */ RelationOpenSmgr(rel); /* * Create the FSM file first if it doesn't exist. If smgr_fsm_nblocks is * positive then it must exist, no need for an smgrexists call. */ if ((rel->rd_smgr->smgr_fsm_nblocks == 0 || rel->rd_smgr->smgr_fsm_nblocks == InvalidBlockNumber) && !smgrexists(rel->rd_smgr, FSM_FORKNUM)) smgrcreate(rel->rd_smgr, FSM_FORKNUM, false); fsm_nblocks_now = smgrnblocks(rel->rd_smgr, FSM_FORKNUM); while (fsm_nblocks_now < fsm_nblocks) { PageSetChecksumInplace(pg, fsm_nblocks_now); smgrextend(rel->rd_smgr, FSM_FORKNUM, fsm_nblocks_now, (char *) pg, false); fsm_nblocks_now++; } /* Update local cache with the up-to-date size */ rel->rd_smgr->smgr_fsm_nblocks = fsm_nblocks_now; UnlockRelationForExtension(rel, ExclusiveLock); pfree(pg); }
/* * For heaps, we prevent creation of the FSM unless the number of pages * exceeds HEAP_FSM_CREATION_THRESHOLD. For tables that don't already have * a FSM, this will save an inode and a few kB of space. * * XXX The API is a little awkward -- if the caller passes a valid nblocks * value, it can avoid invoking a system call. If the caller passes * InvalidBlockNumber and receives a false return value, it can get an * up-to-date relation size from get_nblocks. This saves a few cycles in * the caller, which would otherwise need to get the relation size by itself. */ static bool fsm_allow_writes(Relation rel, BlockNumber heapblk, BlockNumber nblocks, BlockNumber *get_nblocks) { bool skip_get_nblocks; if (heapblk >= HEAP_FSM_CREATION_THRESHOLD) return true; /* Non-heap rels can always create a FSM. */ if (rel->rd_rel->relkind != RELKIND_RELATION && rel->rd_rel->relkind != RELKIND_TOASTVALUE) return true; /* * If the caller knows nblocks, we can avoid a system call later. If it * doesn't, maybe we have relpages from a previous VACUUM. Since the table * may have extended since then, we still have to count the pages later if * we can't return now. */ if (nblocks != InvalidBlockNumber) { if (nblocks > HEAP_FSM_CREATION_THRESHOLD) return true; else skip_get_nblocks = true; } else { if (rel->rd_rel->relpages != InvalidBlockNumber && rel->rd_rel->relpages > HEAP_FSM_CREATION_THRESHOLD) return true; else skip_get_nblocks = false; } RelationOpenSmgr(rel); if (smgrexists(rel->rd_smgr, FSM_FORKNUM)) return true; if (skip_get_nblocks) return false; /* last resort */ *get_nblocks = RelationGetNumberOfBlocks(rel); if (*get_nblocks > HEAP_FSM_CREATION_THRESHOLD) return true; else return false; }
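/*
 * A minimal standalone sketch (not PostgreSQL source) of the caller contract
 * described in the header comment above: pass a known block count to avoid a
 * later system call, or pass InvalidBlockNumber and, on a false return, reuse
 * the size reported back through get_nblocks. All names below
 * (stub_allow_writes, stub_relation_size, the threshold value) are
 * hypothetical stand-ins, not the real implementation.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t BlockNumber;
#define InvalidBlockNumber ((BlockNumber) 0xFFFFFFFF)
#define HEAP_FSM_CREATION_THRESHOLD 4   /* illustrative value only */

static BlockNumber
stub_relation_size(void)
{
    return 3;                   /* pretend the heap currently has 3 pages */
}

/* Mirrors the shape of fsm_allow_writes: true if FSM writes are allowed. */
static bool
stub_allow_writes(BlockNumber heapblk, BlockNumber nblocks,
                  BlockNumber *get_nblocks)
{
    if (heapblk >= HEAP_FSM_CREATION_THRESHOLD)
        return true;
    if (nblocks != InvalidBlockNumber)
        return nblocks > HEAP_FSM_CREATION_THRESHOLD;

    /* caller didn't know the size; measure it once and report it back */
    *get_nblocks = stub_relation_size();
    return *get_nblocks > HEAP_FSM_CREATION_THRESHOLD;
}

int
main(void)
{
    BlockNumber measured = InvalidBlockNumber;

    /* A caller that doesn't know the relation size up front. */
    if (!stub_allow_writes(1, InvalidBlockNumber, &measured))
        printf("skip FSM; relation has %u blocks\n", measured);
    else
        printf("FSM writes allowed\n");
    return 0;
}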
/* * Read a visibility map page. * * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is * true, the visibility map file is extended. */ static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend) { Buffer buf; /* * We might not have opened the relation at the smgr level yet, or we * might have been forced to close it by a sinval message. The code below * won't necessarily notice relation extension immediately when extend = * false, so we rely on sinval messages to ensure that our ideas about the * size of the map aren't too far out of date. */ RelationOpenSmgr(rel); /* * If we haven't cached the size of the visibility map fork yet, check it * first. */ if (rel->rd_smgr->smgr_vm_nblocks == InvalidBlockNumber) { if (smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM)) rel->rd_smgr->smgr_vm_nblocks = smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM); else rel->rd_smgr->smgr_vm_nblocks = 0; } /* Handle requests beyond EOF */ if (blkno >= rel->rd_smgr->smgr_vm_nblocks) { if (extend) vm_extend(rel, blkno + 1); else return InvalidBuffer; } /* * Use ZERO_ON_ERROR mode, and initialize the page if necessary. It's * always safe to clear bits, so it's better to clear corrupt pages than * error out. */ buf = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL); if (PageIsNew(BufferGetPage(buf))) PageInit(BufferGetPage(buf), BLCKSZ, 0); return buf; }
/* * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages * * This does not need to initialize the new bucket pages; we'll do that as * each one is used by _hash_expandtable(). But we have to extend the logical * EOF to the end of the splitpoint; this keeps smgr's idea of the EOF in * sync with ours, so that we don't get complaints from smgr. * * We do this by writing a page of zeroes at the end of the splitpoint range. * We expect that the filesystem will ensure that the intervening pages read * as zeroes too. On many filesystems this "hole" will not be allocated * immediately, which means that the index file may end up more fragmented * than if we forced it all to be allocated now; but since we don't scan * hash indexes sequentially anyway, that probably doesn't matter. * * XXX It's annoying that this code is executed with the metapage lock held. * We need to interlock against _hash_addovflpage() adding a new overflow page * concurrently, but it'd likely be better to use LockRelationForExtension * for the purpose. OTOH, adding a splitpoint is a very infrequent operation, * so it may not be worth worrying about. * * Returns TRUE if successful, or FALSE if allocation failed due to * BlockNumber overflow. */ static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks) { BlockNumber lastblock; char zerobuf[BLCKSZ]; Page page; HashPageOpaque ovflopaque; lastblock = firstblock + nblocks - 1; /* * Check for overflow in block number calculation; if so, we cannot extend * the index anymore. */ if (lastblock < firstblock || lastblock == InvalidBlockNumber) return false; page = (Page) zerobuf; /* * Initialize the page. Just zeroing the page won't work; see * _hash_freeovflpage for similar usage. We take care to make the special * space valid for the benefit of tools such as pageinspect. */ _hash_pageinit(page, BLCKSZ); ovflopaque = (HashPageOpaque) PageGetSpecialPointer(page); ovflopaque->hasho_prevblkno = InvalidBlockNumber; ovflopaque->hasho_nextblkno = InvalidBlockNumber; ovflopaque->hasho_bucket = -1; ovflopaque->hasho_flag = LH_UNUSED_PAGE; ovflopaque->hasho_page_id = HASHO_PAGE_ID; if (RelationNeedsWAL(rel)) log_newpage(&rel->rd_node, MAIN_FORKNUM, lastblock, zerobuf, true); RelationOpenSmgr(rel); smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf, false); return true; }
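/*
 * Standalone arithmetic check (not PostgreSQL source) of the overflow guard
 * used above: with 32-bit block numbers, firstblock + nblocks - 1 can wrap
 * around, and the wrapped value is detected because it compares lower than
 * firstblock (or lands exactly on InvalidBlockNumber). The concrete values
 * below are made up for illustration.
 */
#include <stdint.h>
#include <stdio.h>

typedef uint32_t BlockNumber;
#define InvalidBlockNumber ((BlockNumber) 0xFFFFFFFF)

int
main(void)
{
    BlockNumber firstblock = 0xFFFFFFF0;
    uint32_t    nblocks = 0x40;             /* large enough to wrap */
    BlockNumber lastblock = firstblock + nblocks - 1;

    if (lastblock < firstblock || lastblock == InvalidBlockNumber)
        printf("allocation rejected: lastblock wrapped to %u\n", lastblock);
    else
        printf("allocation ok up to block %u\n", lastblock);
    return 0;
}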
/* * Read a FSM page. * * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is * true, the FSM file is extended. */ static Buffer fsm_readbuf(Relation rel, FSMAddress addr, bool extend) { BlockNumber blkno = fsm_logical_to_physical(addr); Buffer buf; RelationOpenSmgr(rel); /* * If we haven't cached the size of the FSM yet, check it first. Also * recheck if the requested block seems to be past end, since our cached * value might be stale. (We send smgr inval messages on truncation, but * not on extension.) */ if (rel->rd_smgr->smgr_fsm_nblocks == InvalidBlockNumber || blkno >= rel->rd_smgr->smgr_fsm_nblocks) { if (smgrexists(rel->rd_smgr, FSM_FORKNUM)) rel->rd_smgr->smgr_fsm_nblocks = smgrnblocks(rel->rd_smgr, FSM_FORKNUM); else rel->rd_smgr->smgr_fsm_nblocks = 0; } /* Handle requests beyond EOF */ if (blkno >= rel->rd_smgr->smgr_fsm_nblocks) { if (extend) fsm_extend(rel, blkno + 1); else return InvalidBuffer; } /* * Use ZERO_ON_ERROR mode, and initialize the page if necessary. The FSM * information is not accurate anyway, so it's better to clear corrupt * pages than error out. Since the FSM changes are not WAL-logged, the * so-called torn page problem on crash can lead to pages with corrupt * headers, for example. */ buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL); if (PageIsNew(BufferGetPage(buf))) PageInit(BufferGetPage(buf), BLCKSZ, 0); return buf; }
/* * Read a visibility map page. * * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is * true, the visibility map file is extended. */ static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend) { Buffer buf; RelationOpenSmgr(rel); /* * If we haven't cached the size of the visibility map fork yet, check it * first. Also recheck if the requested block seems to be past end, since * our cached value might be stale. (We send smgr inval messages on * truncation, but not on extension.) */ if (rel->rd_smgr->smgr_vm_nblocks == InvalidBlockNumber || blkno >= rel->rd_smgr->smgr_vm_nblocks) { if (smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM)) rel->rd_smgr->smgr_vm_nblocks = smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM); else rel->rd_smgr->smgr_vm_nblocks = 0; } /* Handle requests beyond EOF */ if (blkno >= rel->rd_smgr->smgr_vm_nblocks) { if (extend) vm_extend(rel, blkno + 1); else return InvalidBuffer; } /* * Use ZERO_ON_ERROR mode, and initialize the page if necessary. It's * always safe to clear bits, so it's better to clear corrupt pages than * error out. */ buf = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL); if (PageIsNew(BufferGetPage(buf))) PageInit(BufferGetPage(buf), BLCKSZ, 0); return buf; }
/* * Read a visibility map page. * * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is * true, the visibility map file is extended. */ static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend) { Buffer buf; RelationOpenSmgr(rel); /* * The current size of the visibility map fork is kept in relcache, to * avoid reading beyond EOF. If we haven't cached the size of the map yet, * do that first. */ if (rel->rd_vm_nblocks == InvalidBlockNumber) { if (smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM)) rel->rd_vm_nblocks = smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM); else rel->rd_vm_nblocks = 0; } /* Handle requests beyond EOF */ if (blkno >= rel->rd_vm_nblocks) { if (extend) vm_extend(rel, blkno + 1); else return InvalidBuffer; } /* * Use ZERO_ON_ERROR mode, and initialize the page if necessary. It's * always safe to clear bits, so it's better to clear corrupt pages than * error out. */ buf = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL); if (PageIsNew(BufferGetPage(buf))) PageInit(BufferGetPage(buf), BLCKSZ, 0); return buf; }
/* * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages * * This does not need to initialize the new bucket pages; we'll do that as * each one is used by _hash_expandtable(). But we have to extend the logical * EOF to the end of the splitpoint; this keeps smgr's idea of the EOF in * sync with ours, so that we don't get complaints from smgr. * * We do this by writing a page of zeroes at the end of the splitpoint range. * We expect that the filesystem will ensure that the intervening pages read * as zeroes too. On many filesystems this "hole" will not be allocated * immediately, which means that the index file may end up more fragmented * than if we forced it all to be allocated now; but since we don't scan * hash indexes sequentially anyway, that probably doesn't matter. * * XXX It's annoying that this code is executed with the metapage lock held. * We need to interlock against _hash_getovflpage() adding a new overflow page * concurrently, but it'd likely be better to use LockRelationForExtension * for the purpose. OTOH, adding a splitpoint is a very infrequent operation, * so it may not be worth worrying about. * * Returns TRUE if successful, or FALSE if allocation failed due to * BlockNumber overflow. */ static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks) { BlockNumber lastblock; char zerobuf[BLCKSZ]; lastblock = firstblock + nblocks - 1; /* * Check for overflow in block number calculation; if so, we cannot extend * the index anymore. */ if (lastblock < firstblock || lastblock == InvalidBlockNumber) return false; MemSet(zerobuf, 0, sizeof(zerobuf)); RelationOpenSmgr(rel); smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf, false); return true; }
/*
 * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages
 *
 * This does not need to initialize the new bucket pages; we'll do that as
 * each one is used by _hash_expandtable(). But we have to extend the logical
 * EOF to the end of the splitpoint; otherwise the first overflow page
 * allocated beyond the splitpoint will represent a noncontiguous access,
 * which can confuse md.c (and will probably be forbidden by future changes
 * to md.c).
 *
 * We do this by writing a page of zeroes at the end of the splitpoint range.
 * We expect that the filesystem will ensure that the intervening pages read
 * as zeroes too. On many filesystems this "hole" will not be allocated
 * immediately, which means that the index file may end up more fragmented
 * than if we forced it all to be allocated now; but since we don't scan
 * hash indexes sequentially anyway, that probably doesn't matter.
 *
 * XXX It's annoying that this code is executed with the metapage lock held.
 * We need to interlock against _hash_getovflpage() adding a new overflow page
 * concurrently, but it'd likely be better to use LockRelationForExtension
 * for the purpose. OTOH, adding a splitpoint is a very infrequent operation,
 * so it may not be worth worrying about.
 *
 * Returns TRUE if successful, or FALSE if allocation failed due to
 * BlockNumber overflow.
 */
static bool
_hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
{
    BlockNumber lastblock;
    BlockNumber endblock;
    char        zerobuf[BLCKSZ];

    lastblock = firstblock + nblocks - 1;

    /*
     * Check for overflow in block number calculation; if so, we cannot
     * extend the index anymore.
     */
    if (lastblock < firstblock || lastblock == InvalidBlockNumber)
        return false;

    MemSet(zerobuf, 0, sizeof(zerobuf));

    RelationOpenSmgr(rel);

    /*
     * XXX If the extension results in creation of new segment files,
     * we have to make sure that each non-last file is correctly filled out to
     * RELSEG_SIZE blocks. This ought to be done inside mdextend, but
     * changing the smgr API seems best left for development cycle not late
     * beta. Temporary fix for bug #2737.
     */
#ifndef LET_OS_MANAGE_FILESIZE
    for (endblock = firstblock | (RELSEG_SIZE - 1);
         endblock < lastblock;
         endblock += RELSEG_SIZE)
        smgrextend(rel->rd_smgr, endblock, zerobuf, rel->rd_istemp);
#endif

    smgrextend(rel->rd_smgr, lastblock, zerobuf, rel->rd_istemp);

    return true;
}
static void
ReadBlocks(int filenum)
{
    FILE       *file;
    char        record_type;
    char       *dbname;
    Oid         record_filenode;
    ForkNumber  record_forknum;
    BlockNumber record_blocknum;
    BlockNumber record_range;

    int         log_level       = DEBUG3;
    Oid         relOid          = InvalidOid;
    Relation    rel             = NULL;
    bool        skip_relation   = false;
    bool        skip_fork       = false;
    bool        skip_block      = false;
    BlockNumber nblocks         = 0;
    BlockNumber blocks_restored = 0;
    const char *filepath;

    /*
     * If this condition changes, then this code, and the code in the writer
     * will need to be changed; especially the format specifiers in log and
     * error messages.
     */
    StaticAssertStmt(MaxBlockNumber == 0xFFFFFFFE, "Code may need review.");

    filepath = getSavefileName(filenum);
    file = fileOpen(filepath, PG_BINARY_R);
    dbname = readDBName(file, filepath);

    /*
     * When restoring global objects, the dbname is a zero-length string, and
     * non-zero-length otherwise. And filenum is never expected to be smaller
     * than 1.
     */
    Assert(filenum >= 1);
    Assert(filenum == 1 ? strlen(dbname) == 0 : strlen(dbname) > 0);

    /* To restore the global objects, use default database */
    BackgroundWorkerInitializeConnection(filenum == 1 ? guc_default_database : dbname, NULL);
    SetCurrentStatementStartTimestamp();
    StartTransactionCommand();
    SPI_connect();
    PushActiveSnapshot(GetTransactionSnapshot());
    pgstat_report_activity(STATE_RUNNING, "restoring buffers");

    /*
     * Note that in case of a read error, we will leak the relcache entry that
     * we may currently have open. In case of EOF, we close the relation after
     * the loop.
     */
    while (fileRead(&record_type, 1, file, true, filepath))
    {
        /*
         * If we want to process the signals, this seems to be the best place
         * to do it. Generally the backends refrain from processing the config
         * file while in a transaction, but that's more for the fear of
         * allowing GUC changes to affect expression evaluation, causing
         * different results for the same expression in a transaction. Since
         * this worker is not processing any queries, it is okay to process
         * the config file here.
         *
         * Even though it's okay to process SIGHUP here, doing so doesn't add
         * any value. The only reason we might want to process the config file
         * here would be to allow the user to interrupt the BlockReader's
         * operation by changing this extension's GUC parameter. But the user
         * can do that anyway, using SIGTERM or pg_terminate_backend().
         */

        /* Stop processing the save-file if the Postmaster wants us to die. */
        if (got_sigterm)
            break;

        ereport(log_level,
                (errmsg("record type %x - %c", record_type, record_type)));

        switch (record_type)
        {
            case 'r':
            {
                /* Close the previous relation, if any. */
                if (rel)
                {
                    relation_close(rel, AccessShareLock);
                    rel = NULL;
                }

                record_forknum = InvalidForkNumber;
                record_blocknum = InvalidBlockNumber;
                nblocks = 0;

                fileRead(&record_filenode, sizeof(Oid), file, false, filepath);

                relOid = GetRelOid(record_filenode);

                ereport(log_level, (errmsg("processing filenode %u, relation %u",
                                           record_filenode, relOid)));

                /*
                 * If the relation has been rewritten/dropped since we saved
                 * it, just skip it and process the next relation.
                 */
                if (relOid == InvalidOid)
                    skip_relation = true;
                else
                {
                    skip_relation = false;

                    /* Open the relation */
                    rel = relation_open(relOid, AccessShareLock);
                    RelationOpenSmgr(rel);
                }
            }
            break;
            case 'f':
            {
                record_blocknum = InvalidBlockNumber;
                nblocks = 0;

                fileRead(&record_forknum, sizeof(ForkNumber), file, false, filepath);

                if (skip_relation)
                    continue;

                if (rel == NULL)
                    ereport(ERROR,
                            (errmsg("found a fork record without a preceding relation record")));

                ereport(log_level, (errmsg("processing fork %d", record_forknum)));

                if (!smgrexists(rel->rd_smgr, record_forknum))
                    skip_fork = true;
                else
                {
                    skip_fork = false;

                    nblocks = RelationGetNumberOfBlocksInFork(rel, record_forknum);
                }
            }
            break;
            case 'b':
            {
                if (record_forknum == InvalidForkNumber)
                    ereport(ERROR,
                            (errmsg("found a block record without a preceding fork record")));

                fileRead(&record_blocknum, sizeof(BlockNumber), file, false, filepath);

                if (skip_relation || skip_fork)
                    continue;

                /*
                 * Don't try to read past the file; the file may have been
                 * shrunk by a vacuum/truncate operation.
                 */
                if (record_blocknum >= nblocks)
                {
                    ereport(log_level,
                            (errmsg("reader %d skipping block filenode %u forknum %d blocknum %u",
                                    filenum, record_filenode, record_forknum, record_blocknum)));
                    skip_block = true;
                    continue;
                }
                else
                {
                    Buffer      buf;

                    skip_block = false;

                    ereport(log_level,
                            (errmsg("reader %d reading block filenode %u forknum %d blocknum %u",
                                    filenum, record_filenode, record_forknum, record_blocknum)));

                    buf = ReadBufferExtended(rel, record_forknum, record_blocknum, RBM_NORMAL, NULL);
                    ReleaseBuffer(buf);

                    ++blocks_restored;
                }
            }
            break;
            case 'N':
            {
                BlockNumber block;

                Assert(record_blocknum != InvalidBlockNumber);

                if (record_blocknum == InvalidBlockNumber)
                    ereport(ERROR,
                            (errmsg("found a block range record without a preceding block record")));

                fileRead(&record_range, sizeof(int), file, false, filepath);

                if (skip_relation || skip_fork || skip_block)
                    continue;

                ereport(log_level,
                        (errmsg("reader %d reading range filenode %u forknum %d blocknum %u range %u",
                                filenum, record_filenode, record_forknum, record_blocknum, record_range)));

                for (block = record_blocknum + 1; block <= (record_blocknum + record_range); ++block)
                {
                    Buffer      buf;

                    /*
                     * Don't try to read past the file; the file may have been
                     * shrunk by a vacuum operation.
                     */
                    if (block >= nblocks)
                    {
                        ereport(log_level,
                                (errmsg("reader %d skipping block range filenode %u forknum %d start %u end %u",
                                        filenum, record_filenode, record_forknum,
                                        block, record_blocknum + record_range)));
                        break;
                    }

                    buf = ReadBufferExtended(rel, record_forknum, block, RBM_NORMAL, NULL);
                    ReleaseBuffer(buf);

                    ++blocks_restored;
                }
            }
            break;
            default:
            {
                ereport(ERROR, (errmsg("found unexpected save-file marker %x - %c",
                                       record_type, record_type)));
                Assert(false);
            }
            break;
        }
    }

    if (rel)
        relation_close(rel, AccessShareLock);

    ereport(LOG,
            (errmsg("Block Reader %d: restored %u blocks", filenum, blocks_restored)));

    SPI_finish();
    PopActiveSnapshot();
    CommitTransactionCommand();
    pgstat_report_activity(STATE_IDLE, NULL);

    fileClose(file, filepath);

    /* Remove the save-file */
    if (remove(filepath) != 0)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("error removing file \"%s\": %m", filepath)));
}
/*
 * visibilitymap_truncate - truncate the visibility map
 *
 * The caller must hold AccessExclusiveLock on the relation, to ensure that
 * other backends receive the smgr invalidation event that this function sends
 * before they access the VM again.
 *
 * nheapblocks is the new size of the heap.
 */
void
visibilitymap_truncate(Relation rel, BlockNumber nheapblocks)
{
    BlockNumber newnblocks;

    /* last remaining block, byte, and bit */
    BlockNumber truncBlock = HEAPBLK_TO_MAPBLOCK(nheapblocks);
    uint32      truncByte = HEAPBLK_TO_MAPBYTE(nheapblocks);
    uint8       truncBit = HEAPBLK_TO_MAPBIT(nheapblocks);

#ifdef TRACE_VISIBILITYMAP
    elog(DEBUG1, "vm_truncate %s %d", RelationGetRelationName(rel), nheapblocks);
#endif

    RelationOpenSmgr(rel);

    /*
     * If no visibility map has been created yet for this relation, there's
     * nothing to truncate.
     */
    if (!smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
        return;

    /*
     * Unless the new size is exactly at a visibility map page boundary, the
     * tail bits in the last remaining map page, representing truncated heap
     * blocks, need to be cleared. This is not only tidy, but also necessary
     * because we don't get a chance to clear the bits if the heap is extended
     * again.
     */
    if (truncByte != 0 || truncBit != 0)
    {
        Buffer      mapBuffer;
        Page        page;
        char       *map;

        newnblocks = truncBlock + 1;

        mapBuffer = vm_readbuf(rel, truncBlock, false);
        if (!BufferIsValid(mapBuffer))
        {
            /* nothing to do, the file was already smaller */
            return;
        }

        page = BufferGetPage(mapBuffer);
        map = PageGetContents(page);

        LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE);

        /* Clear out the unwanted bytes. */
        MemSet(&map[truncByte + 1], 0, MAPSIZE - (truncByte + 1));

        /*
         * Mask out the unwanted bits of the last remaining byte.
         *
         * ((1 << 0) - 1) = 00000000
         * ((1 << 1) - 1) = 00000001
         * ...
         * ((1 << 6) - 1) = 00111111
         * ((1 << 7) - 1) = 01111111
         */
        map[truncByte] &= (1 << truncBit) - 1;

        MarkBufferDirty(mapBuffer);
        UnlockReleaseBuffer(mapBuffer);
    }
    else
        newnblocks = truncBlock;

    if (smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM) <= newnblocks)
    {
        /* nothing to do, the file was already smaller than requested size */
        return;
    }

    /* Truncate the unused VM pages, and send smgr inval message */
    smgrtruncate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, newnblocks);

    /*
     * We might as well update the local smgr_vm_nblocks setting. smgrtruncate
     * sent an smgr cache inval message, which will cause other backends to
     * invalidate their copy of smgr_vm_nblocks, and this one too at the next
     * command boundary. But this ensures it isn't outright wrong until then.
     */
    if (rel->rd_smgr)
        rel->rd_smgr->smgr_vm_nblocks = newnblocks;
}
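/*
 * Standalone arithmetic check (not PostgreSQL source) of the masking step
 * above: map[truncByte] &= (1 << truncBit) - 1 keeps only the bits for heap
 * blocks below the truncation point within the last surviving map byte. The
 * one-bit-per-heap-block layout assumed here is the one shown in the comment
 * above; the program is purely illustrative.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint8_t     map_byte = 0xFF;        /* all eight heap blocks marked */
    unsigned    truncBit;

    for (truncBit = 0; truncBit < 8; truncBit++)
        printf("truncBit=%u mask=0x%02X result=0x%02X\n",
               truncBit,
               (unsigned) ((1u << truncBit) - 1),
               (unsigned) (map_byte & ((1u << truncBit) - 1)));
    return 0;
}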
/* * Read tuples in correct sort order from tuplesort, and load them into * btree leaves. */ static void _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) { BTPageState *state = NULL; bool merge = (btspool2 != NULL); IndexTuple itup, itup2 = NULL; bool load1; TupleDesc tupdes = RelationGetDescr(wstate->index); int i, keysz = RelationGetNumberOfAttributes(wstate->index); ScanKey indexScanKey = NULL; SortSupport sortKeys; if (merge) { /* * Another BTSpool for dead tuples exists. Now we have to merge * btspool and btspool2. */ /* the preparation of merge */ itup = tuplesort_getindextuple(btspool->sortstate, true); itup2 = tuplesort_getindextuple(btspool2->sortstate, true); indexScanKey = _bt_mkscankey_nodata(wstate->index); /* Prepare SortSupport data for each column */ sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData)); for (i = 0; i < keysz; i++) { SortSupport sortKey = sortKeys + i; ScanKey scanKey = indexScanKey + i; int16 strategy; sortKey->ssup_cxt = CurrentMemoryContext; sortKey->ssup_collation = scanKey->sk_collation; sortKey->ssup_nulls_first = (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; sortKey->ssup_attno = scanKey->sk_attno; /* Abbreviation is not supported here */ sortKey->abbreviate = false; AssertState(sortKey->ssup_attno != 0); strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? BTGreaterStrategyNumber : BTLessStrategyNumber; PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey); } _bt_freeskey(indexScanKey); for (;;) { load1 = true; /* load BTSpool next ? */ if (itup2 == NULL) { if (itup == NULL) break; } else if (itup != NULL) { for (i = 1; i <= keysz; i++) { SortSupport entry; Datum attrDatum1, attrDatum2; bool isNull1, isNull2; int32 compare; entry = sortKeys + i - 1; attrDatum1 = index_getattr(itup, i, tupdes, &isNull1); attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2); compare = ApplySortComparator(attrDatum1, isNull1, attrDatum2, isNull2, entry); if (compare > 0) { load1 = false; break; } else if (compare < 0) break; } } else load1 = false; /* When we see first tuple, create first index page */ if (state == NULL) state = _bt_pagestate(wstate, 0); if (load1) { _bt_buildadd(wstate, state, itup); itup = tuplesort_getindextuple(btspool->sortstate, true); } else { _bt_buildadd(wstate, state, itup2); itup2 = tuplesort_getindextuple(btspool2->sortstate, true); } } pfree(sortKeys); } else { /* merge is unnecessary */ while ((itup = tuplesort_getindextuple(btspool->sortstate, true)) != NULL) { /* When we see first tuple, create first index page */ if (state == NULL) state = _bt_pagestate(wstate, 0); _bt_buildadd(wstate, state, itup); } } /* Close down final pages and write the metapage */ _bt_uppershutdown(wstate, state); /* * If the index is WAL-logged, we must fsync it down to disk before it's * safe to commit the transaction. (For a non-WAL-logged index we don't * care since the index will be uninteresting after a crash anyway.) * * It's obvious that we must do this when not WAL-logging the build. It's * less obvious that we have to do it even if we did WAL-log the index * pages. The reason is that since we're building outside shared buffers, * a CHECKPOINT occurring during the build has no way to flush the * previously written data to disk (indeed it won't know the index even * exists). A crash later on would replay WAL from the checkpoint, * therefore it wouldn't replay our earlier WAL entries. If we do not * fsync those pages here, they might still not be on disk when the crash * occurs. 
*/ if (RelationNeedsWAL(wstate->index)) { RelationOpenSmgr(wstate->index); smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM); } }
/* * Insert a tuple to the new relation. This has to track heap_insert * and its subsidiary functions! * * t_self of the tuple is set to the new TID of the tuple. If t_ctid of the * tuple is invalid on entry, it's replaced with the new TID as well (in * the inserted data only, not in the caller's copy). */ static void raw_heap_insert(RewriteState state, HeapTuple tup) { Page page = state->rs_buffer; Size pageFreeSpace, saveFreeSpace; Size len; OffsetNumber newoff; HeapTuple heaptup; /* * If the new tuple is too big for storage or contains already toasted * out-of-line attributes from some other relation, invoke the toaster. * * Note: below this point, heaptup is the data we actually intend to store * into the relation; tup is the caller's original untoasted data. */ if (state->rs_new_rel->rd_rel->relkind == RELKIND_TOASTVALUE) { /* toast table entries should never be recursively toasted */ Assert(!HeapTupleHasExternal(tup)); heaptup = tup; } else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD) heaptup = toast_insert_or_update(state->rs_new_rel, tup, NULL, HEAP_INSERT_SKIP_FSM | (state->rs_use_wal ? 0 : HEAP_INSERT_SKIP_WAL)); else heaptup = tup; len = MAXALIGN(heaptup->t_len); /* be conservative */ /* * If we're gonna fail for oversize tuple, do it right away */ if (len > MaxHeapTupleSize) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("row is too big: size %zu, maximum size %zu", len, MaxHeapTupleSize))); /* Compute desired extra freespace due to fillfactor option */ saveFreeSpace = RelationGetTargetPageFreeSpace(state->rs_new_rel, HEAP_DEFAULT_FILLFACTOR); /* Now we can check to see if there's enough free space already. */ if (state->rs_buffer_valid) { pageFreeSpace = PageGetHeapFreeSpace(page); if (len + saveFreeSpace > pageFreeSpace) { /* Doesn't fit, so write out the existing page */ /* XLOG stuff */ if (state->rs_use_wal) log_newpage(&state->rs_new_rel->rd_node, MAIN_FORKNUM, state->rs_blockno, page, true); /* * Now write the page. We say isTemp = true even if it's not a * temp table, because there's no need for smgr to schedule an * fsync for this write; we'll do it ourselves in * end_heap_rewrite. */ RelationOpenSmgr(state->rs_new_rel); PageSetChecksumInplace(page, state->rs_blockno); smgrextend(state->rs_new_rel->rd_smgr, MAIN_FORKNUM, state->rs_blockno, (char *) page, true); state->rs_blockno++; state->rs_buffer_valid = false; } } if (!state->rs_buffer_valid) { /* Initialize a new empty page */ PageInit(page, BLCKSZ, 0); state->rs_buffer_valid = true; } /* And now we can insert the tuple into the page */ newoff = PageAddItem(page, (Item) heaptup->t_data, heaptup->t_len, InvalidOffsetNumber, false, true); if (newoff == InvalidOffsetNumber) elog(ERROR, "failed to add tuple"); /* Update caller's t_self to the actual position where it was stored */ ItemPointerSet(&(tup->t_self), state->rs_blockno, newoff); /* * Insert the correct position into CTID of the stored tuple, too, if the * caller didn't supply a valid CTID. */ if (!ItemPointerIsValid(&tup->t_data->t_ctid)) { ItemId newitemid; HeapTupleHeader onpage_tup; newitemid = PageGetItemId(page, newoff); onpage_tup = (HeapTupleHeader) PageGetItem(page, newitemid); onpage_tup->t_ctid = tup->t_self; } /* If heaptup is a private copy, release it. */ if (heaptup != tup) heap_freetuple(heaptup); }
/*
 * emit a completed btree page, and release the working storage.
 */
static void
_bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
{
    // Fetch gp_persistent_relation_node information that will be added to XLOG record.
    RelationFetchGpRelationNodeForXLog(wstate->index);

    /* Ensure rd_smgr is open (could have been closed by relcache flush!) */
    RelationOpenSmgr(wstate->index);

    /* XLOG stuff */
    if (wstate->btws_use_wal)
    {
        _bt_lognewpage(wstate->index, page, blkno);
    }
    else
    {
        /* Leave the page LSN zero if not WAL-logged, but set TLI anyway */
        PageSetTLI(page, ThisTimeLineID);
    }

    /*
     * If we have to write pages nonsequentially, fill in the space with
     * zeroes until we come back and overwrite. This is not logically
     * necessary on standard Unix filesystems (unwritten space will read as
     * zeroes anyway), but it should help to avoid fragmentation. The dummy
     * pages aren't WAL-logged though.
     */
    while (blkno > wstate->btws_pages_written)
    {
        if (!wstate->btws_zeropage)
            wstate->btws_zeropage = (Page) palloc0(BLCKSZ);

        // -------- MirroredLock ----------
        // UNDONE: Unfortunately, I think we write temp relations to the mirror...
        LWLockAcquire(MirroredLock, LW_SHARED);

        smgrextend(wstate->index->rd_smgr, wstate->btws_pages_written++,
                   (char *) wstate->btws_zeropage,
                   true);

        LWLockRelease(MirroredLock);
        // -------- MirroredLock ----------
    }

    // -------- MirroredLock ----------
    // UNDONE: Unfortunately, I think we write temp relations to the mirror...
    LWLockAcquire(MirroredLock, LW_SHARED);

    /*
     * Now write the page. We say isTemp = true even if it's not a temp
     * index, because there's no need for smgr to schedule an fsync for this
     * write; we'll do it ourselves before ending the build.
     */
    if (blkno == wstate->btws_pages_written)
    {
        /* extending the file... */
        smgrextend(wstate->index->rd_smgr, blkno, (char *) page, true);
        wstate->btws_pages_written++;
    }
    else
    {
        /* overwriting a block we zero-filled before */
        smgrwrite(wstate->index->rd_smgr, blkno, (char *) page, true);
    }

    LWLockRelease(MirroredLock);
    // -------- MirroredLock ----------

    pfree(page);
}
/* * Open a relation during XLOG replay * * Note: this once had an API that allowed NULL return on failure, but it * no longer does; any failure results in elog(). */ Relation XLogOpenRelation(RelFileNode rnode) { XLogRelDesc *res; XLogRelCacheEntry *hentry; bool found; hentry = (XLogRelCacheEntry *) hash_search(_xlrelcache, (void *) &rnode, HASH_FIND, NULL); if (hentry) { res = hentry->rdesc; res->lessRecently->moreRecently = res->moreRecently; res->moreRecently->lessRecently = res->lessRecently; } else { res = _xl_new_reldesc(); sprintf(RelationGetRelationName(&(res->reldata)), "%u", rnode.relNode); res->reldata.rd_node = rnode; /* * We set up the lockRelId in case anything tries to lock the dummy * relation. Note that this is fairly bogus since relNode may be * different from the relation's OID. It shouldn't really matter * though, since we are presumably running by ourselves and can't have * any lock conflicts ... */ res->reldata.rd_lockInfo.lockRelId.dbId = rnode.dbNode; res->reldata.rd_lockInfo.lockRelId.relId = rnode.relNode; hentry = (XLogRelCacheEntry *) hash_search(_xlrelcache, (void *) &rnode, HASH_ENTER, &found); if (found) elog(PANIC, "xlog relation already present on insert into cache"); hentry->rdesc = res; res->reldata.rd_targblock = InvalidBlockNumber; res->reldata.rd_smgr = NULL; RelationOpenSmgr(&(res->reldata)); /* * Create the target file if it doesn't already exist. This lets us * cope if the replay sequence contains writes to a relation that is * later deleted. (The original coding of this routine would instead * return NULL, causing the writes to be suppressed. But that seems * like it risks losing valuable data if the filesystem loses an inode * during a crash. Better to write the data until we are actually * told to delete the file.) */ smgrcreate(res->reldata.rd_smgr, res->reldata.rd_istemp, true); } res->moreRecently = &(_xlrelarr[0]); res->lessRecently = _xlrelarr[0].lessRecently; _xlrelarr[0].lessRecently = res; res->lessRecently->moreRecently = res; return &(res->reldata); }
/* * FreeSpaceMapTruncateRel - adjust for truncation of a relation. * * The caller must hold AccessExclusiveLock on the relation, to ensure that * other backends receive the smgr invalidation event that this function sends * before they access the FSM again. * * nblocks is the new size of the heap. */ void FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks) { BlockNumber new_nfsmblocks; FSMAddress first_removed_address; uint16 first_removed_slot; Buffer buf; RelationOpenSmgr(rel); /* * If no FSM has been created yet for this relation, there's nothing to * truncate. */ if (!smgrexists(rel->rd_smgr, FSM_FORKNUM)) return; /* Get the location in the FSM of the first removed heap block */ first_removed_address = fsm_get_location(nblocks, &first_removed_slot); /* * Zero out the tail of the last remaining FSM page. If the slot * representing the first removed heap block is at a page boundary, as the * first slot on the FSM page that first_removed_address points to, we can * just truncate that page altogether. */ if (first_removed_slot > 0) { buf = fsm_readbuf(rel, first_removed_address, false); if (!BufferIsValid(buf)) return; /* nothing to do; the FSM was already smaller */ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); /* NO EREPORT(ERROR) from here till changes are logged */ START_CRIT_SECTION(); fsm_truncate_avail(BufferGetPage(buf), first_removed_slot); /* * Truncation of a relation is WAL-logged at a higher-level, and we * will be called at WAL replay. But if checksums are enabled, we need * to still write a WAL record to protect against a torn page, if the * page is flushed to disk before the truncation WAL record. We cannot * use MarkBufferDirtyHint here, because that will not dirty the page * during recovery. */ MarkBufferDirty(buf); if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded()) log_newpage_buffer(buf, false); END_CRIT_SECTION(); UnlockReleaseBuffer(buf); new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1; } else { new_nfsmblocks = fsm_logical_to_physical(first_removed_address); if (smgrnblocks(rel->rd_smgr, FSM_FORKNUM) <= new_nfsmblocks) return; /* nothing to do; the FSM was already smaller */ } /* Truncate the unused FSM pages, and send smgr inval message */ smgrtruncate(rel->rd_smgr, FSM_FORKNUM, new_nfsmblocks); /* * We might as well update the local smgr_fsm_nblocks setting. * smgrtruncate sent an smgr cache inval message, which will cause other * backends to invalidate their copy of smgr_fsm_nblocks, and this one too * at the next command boundary. But this ensures it isn't outright wrong * until then. */ if (rel->rd_smgr) rel->rd_smgr->smgr_fsm_nblocks = new_nfsmblocks; /* * Update upper-level FSM pages to account for the truncation. This is * important because the just-truncated pages were likely marked as * all-free, and would be preferentially selected. */ FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber); }
/* * Read a FSM page. * * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is * true, the FSM file is extended. */ static Buffer fsm_readbuf(Relation rel, FSMAddress addr, bool extend) { BlockNumber blkno = fsm_logical_to_physical(addr); Buffer buf; RelationOpenSmgr(rel); /* * If we haven't cached the size of the FSM yet, check it first. Also * recheck if the requested block seems to be past end, since our cached * value might be stale. (We send smgr inval messages on truncation, but * not on extension.) */ if (rel->rd_smgr->smgr_fsm_nblocks == InvalidBlockNumber || blkno >= rel->rd_smgr->smgr_fsm_nblocks) { if (smgrexists(rel->rd_smgr, FSM_FORKNUM)) rel->rd_smgr->smgr_fsm_nblocks = smgrnblocks(rel->rd_smgr, FSM_FORKNUM); else rel->rd_smgr->smgr_fsm_nblocks = 0; } /* Handle requests beyond EOF */ if (blkno >= rel->rd_smgr->smgr_fsm_nblocks) { if (extend) fsm_extend(rel, blkno + 1); else return InvalidBuffer; } /* * Use ZERO_ON_ERROR mode, and initialize the page if necessary. The FSM * information is not accurate anyway, so it's better to clear corrupt * pages than error out. Since the FSM changes are not WAL-logged, the * so-called torn page problem on crash can lead to pages with corrupt * headers, for example. * * The initialize-the-page part is trickier than it looks, because of the * possibility of multiple backends doing this concurrently, and our * desire to not uselessly take the buffer lock in the normal path where * the page is OK. We must take the lock to initialize the page, so * recheck page newness after we have the lock, in case someone else * already did it. Also, because we initially check PageIsNew with no * lock, it's possible to fall through and return the buffer while someone * else is still initializing the page (i.e., we might see pd_upper as set * but other page header fields are still zeroes). This is harmless for * callers that will take a buffer lock themselves, but some callers * inspect the page without any lock at all. The latter is OK only so * long as it doesn't depend on the page header having correct contents. * Current usage is safe because PageGetContents() does not require that. */ buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL); if (PageIsNew(BufferGetPage(buf))) { LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); if (PageIsNew(BufferGetPage(buf))) PageInit(BufferGetPage(buf), BLCKSZ, 0); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } return buf; }
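/*
 * A generic, standalone illustration (pthreads, not PostgreSQL buffer locks)
 * of the idiom described in the comment above: check cheaply without the
 * lock, then take the lock and re-check before initializing, so concurrent
 * callers initialize at most once. The unlocked first check mirrors the
 * deliberately cheap fast path discussed above; all names here are
 * hypothetical.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
static bool page_is_new = true;         /* stands in for PageIsNew() */

static void *
init_page_once(void *arg)
{
    (void) arg;

    if (page_is_new)                    /* cheap check without the lock */
    {
        pthread_mutex_lock(&page_lock);
        if (page_is_new)                /* re-check now that we hold the lock */
        {
            printf("initializing page\n");      /* runs at most once */
            page_is_new = false;
        }
        pthread_mutex_unlock(&page_lock);
    }
    return NULL;
}

int
main(void)
{
    pthread_t   t1, t2;

    pthread_create(&t1, NULL, init_page_once, NULL);
    pthread_create(&t2, NULL, init_page_once, NULL);
    pthread_join(t1, NULL);
    pthread_join(t2, NULL);
    return 0;
}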
/*
 * _bt_mergeload - Merge two streams of index tuples into new index files.
 */
static void
_bt_mergeload(Spooler *self, BTWriteState *wstate, BTSpool *btspool,
              BTReader *btspool2, Relation heapRel)
{
    BTPageState *state = NULL;
    IndexTuple  itup,
                itup2;
    bool        should_free = false;
    TupleDesc   tupdes = RelationGetDescr(wstate->index);
    int         keysz = RelationGetNumberOfAttributes(wstate->index);
    ScanKey     indexScanKey;
    ON_DUPLICATE on_duplicate = self->on_duplicate;

    Assert(btspool != NULL);

    /* the preparation of merge */
    itup = BTSpoolGetNextItem(btspool, NULL, &should_free);
    itup2 = BTReaderGetNextItem(btspool2);
    indexScanKey = _bt_mkscankey_nodata(wstate->index);

    for (;;)
    {
        bool        load1 = true;       /* load BTSpool next ? */
        bool        hasnull;
        int32       compare;

        if (self->dup_old + self->dup_new > self->max_dup_errors)
            ereport(ERROR,
                    (errcode(ERRCODE_INTERNAL_ERROR),
                     errmsg("Maximum duplicate error count exceeded")));

        if (itup2 == NULL)
        {
            if (itup == NULL)
                break;
        }
        else if (itup != NULL)
        {
            compare = compare_indextuple(itup, itup2, indexScanKey,
                                         keysz, tupdes, &hasnull);

            if (compare == 0 && !hasnull && btspool->isunique)
            {
                ItemPointerData t_tid2;

                /*
                 * heap_is_visible() updates t_tid, so back it up first; we
                 * still need the original value for the index entry.
                 */
                ItemPointerCopy(&itup2->t_tid, &t_tid2);

                /* The tuple pointed by the old index should not be visible. */
                if (!heap_is_visible(heapRel, &itup->t_tid))
                {
                    itup = BTSpoolGetNextItem(btspool, itup, &should_free);
                }
                else if (!heap_is_visible(heapRel, &itup2->t_tid))
                {
                    itup2 = BTReaderGetNextItem(btspool2);
                }
                else
                {
                    if (on_duplicate == ON_DUPLICATE_KEEP_NEW)
                    {
                        self->dup_old++;
                        remove_duplicate(self, heapRel, itup2,
                                         RelationGetRelationName(wstate->index));
                        itup2 = BTReaderGetNextItem(btspool2);
                    }
                    else
                    {
                        ItemPointerCopy(&t_tid2, &itup2->t_tid);
                        self->dup_new++;
                        remove_duplicate(self, heapRel, itup,
                                         RelationGetRelationName(wstate->index));
                        itup = BTSpoolGetNextItem(btspool, itup, &should_free);
                    }
                }

                continue;
            }
            else if (compare > 0)
                load1 = false;
        }
        else
            load1 = false;

        BULKLOAD_PROFILE(&prof_merge_unique);

        /* When we see first tuple, create first index page */
        if (state == NULL)
            state = _bt_pagestate(wstate, 0);

        if (load1)
        {
            IndexTuple  next_itup = NULL;
            bool        next_should_free = false;

            for (;;)
            {
                /* get next item */
                next_itup = BTSpoolGetNextItem(btspool, next_itup,
                                               &next_should_free);

                if (!btspool->isunique || next_itup == NULL)
                    break;

                compare = compare_indextuple(itup, next_itup, indexScanKey,
                                             keysz, tupdes, &hasnull);
                if (compare < 0 || hasnull)
                    break;

                if (compare > 0)
                {
                    /* shouldn't happen */
                    elog(ERROR, "failed in tuplesort_performsort");
                }

                /*
                 * If the tuple was deleted by another unique index, it is not
                 * visible.
                 */
                if (!heap_is_visible(heapRel, &next_itup->t_tid))
                {
                    continue;
                }

                if (!heap_is_visible(heapRel, &itup->t_tid))
                {
                    if (should_free)
                        pfree(itup);

                    itup = next_itup;
                    should_free = next_should_free;
                    next_should_free = false;
                    continue;
                }

                /* not unique between input files */
                self->dup_new++;
                remove_duplicate(self, heapRel, next_itup,
                                 RelationGetRelationName(wstate->index));

                if (self->dup_old + self->dup_new > self->max_dup_errors)
                    ereport(ERROR,
                            (errcode(ERRCODE_INTERNAL_ERROR),
                             errmsg("Maximum duplicate error count exceeded")));
            }

            _bt_buildadd(wstate, state, itup);

            if (should_free)
                pfree(itup);

            itup = next_itup;
            should_free = next_should_free;
        }
        else
        {
            _bt_buildadd(wstate, state, itup2);
            itup2 = BTReaderGetNextItem(btspool2);
        }
        BULKLOAD_PROFILE(&prof_merge_insert);
    }
    _bt_freeskey(indexScanKey);

    /* Close down final pages and write the metapage */
    _bt_uppershutdown(wstate, state);

    /*
     * If the index isn't temp, we must fsync it down to disk before it's safe
     * to commit the transaction. (For a temp index we don't care since the
     * index will be uninteresting after a crash anyway.)
     *
     * It's obvious that we must do this when not WAL-logging the build. It's
     * less obvious that we have to do it even if we did WAL-log the index
     * pages. The reason is that since we're building outside shared buffers,
     * a CHECKPOINT occurring during the build has no way to flush the
     * previously written data to disk (indeed it won't know the index even
     * exists). A crash later on would replay WAL from the checkpoint,
     * therefore it wouldn't replay our earlier WAL entries. If we do not
     * fsync those pages here, they might still not be on disk when the crash
     * occurs.
     */
    if (!RELATION_IS_LOCAL(wstate->index))
    {
        RelationOpenSmgr(wstate->index);
        smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
    }
    BULKLOAD_PROFILE(&prof_merge_term);
}
/* * Open a relation during XLOG replay * * Note: this once had an API that allowed NULL return on failure, but it * no longer does; any failure results in elog(). */ Relation XLogOpenRelation(RelFileNode rnode) { XLogRelDesc *res; XLogRelCacheEntry *hentry; bool found; hentry = (XLogRelCacheEntry *) hash_search(_xlrelcache, (void *) &rnode, HASH_FIND, NULL); if (hentry) { res = hentry->rdesc; res->lessRecently->moreRecently = res->moreRecently; res->moreRecently->lessRecently = res->lessRecently; } else { /* * We need to fault in the database directory on the standby. */ if (rnode.spcNode != GLOBALTABLESPACE_OID && IsStandbyMode()) { char *primaryFilespaceLocation = NULL; char *dbPath; if (IsBuiltinTablespace(rnode.spcNode)) { /* * No filespace to fetch. */ } else { char *mirrorFilespaceLocation = NULL; /* * Investigate whether the containing directories exist to give more detail. */ PersistentTablespace_GetPrimaryAndMirrorFilespaces( rnode.spcNode, &primaryFilespaceLocation, &mirrorFilespaceLocation); if (primaryFilespaceLocation == NULL || strlen(primaryFilespaceLocation) == 0) { elog(ERROR, "Empty primary filespace directory location"); } if (mirrorFilespaceLocation != NULL) { pfree(mirrorFilespaceLocation); mirrorFilespaceLocation = NULL; } } dbPath = (char*)palloc(MAXPGPATH + 1); FormDatabasePath( dbPath, primaryFilespaceLocation, rnode.spcNode, rnode.dbNode); if (primaryFilespaceLocation != NULL) { pfree(primaryFilespaceLocation); primaryFilespaceLocation = NULL; } if (mkdir(dbPath, 0700) == 0) { if (Debug_persistent_recovery_print) { elog(PersistentRecovery_DebugPrintLevel(), "XLogOpenRelation: Re-created database directory \"%s\"", dbPath); } } else { /* * Allowed to already exist. */ if (errno != EEXIST) { elog(ERROR, "could not create database directory \"%s\": %m", dbPath); } else { if (Debug_persistent_recovery_print) { elog(PersistentRecovery_DebugPrintLevel(), "XLogOpenRelation: Database directory \"%s\" already exists", dbPath); } } } pfree(dbPath); } res = _xl_new_reldesc(); sprintf(RelationGetRelationName(&(res->reldata)), "%u", rnode.relNode); res->reldata.rd_node = rnode; /* * We set up the lockRelId in case anything tries to lock the dummy * relation. Note that this is fairly bogus since relNode may be * different from the relation's OID. It shouldn't really matter * though, since we are presumably running by ourselves and can't have * any lock conflicts ... */ res->reldata.rd_lockInfo.lockRelId.dbId = rnode.dbNode; res->reldata.rd_lockInfo.lockRelId.relId = rnode.relNode; hentry = (XLogRelCacheEntry *) hash_search(_xlrelcache, (void *) &rnode, HASH_ENTER, &found); if (found) elog(PANIC, "xlog relation already present on insert into cache"); hentry->rdesc = res; res->reldata.rd_targblock = InvalidBlockNumber; res->reldata.rd_smgr = NULL; RelationOpenSmgr(&(res->reldata)); /* * Create the target file if it doesn't already exist. This lets us * cope if the replay sequence contains writes to a relation that is * later deleted. (The original coding of this routine would instead * return NULL, causing the writes to be suppressed. But that seems * like it risks losing valuable data if the filesystem loses an inode * during a crash. Better to write the data until we are actually * told to delete the file.) */ // NOTE: We no longer re-create files automatically because // new FileRep persistent objects will ensure files exist. // UNDONE: Can't remove this block of code yet until boot time calls to this routine are analyzed... 
{ MirrorDataLossTrackingState mirrorDataLossTrackingState; int64 mirrorDataLossTrackingSessionNum; bool mirrorDataLossOccurred; // UNDONE: What about the persistent rel files table??? // UNDONE: This condition should not occur anymore. // UNDONE: segmentFileNum and AO? mirrorDataLossTrackingState = FileRepPrimary_GetMirrorDataLossTrackingSessionNum( &mirrorDataLossTrackingSessionNum); smgrcreate( res->reldata.rd_smgr, res->reldata.rd_isLocalBuf, /* relationName */ NULL, // Ok to be NULL -- we don't know the name here. mirrorDataLossTrackingState, mirrorDataLossTrackingSessionNum, /* ignoreAlreadyExists */ true, &mirrorDataLossOccurred); } } res->moreRecently = &(_xlrelarr[0]); res->lessRecently = _xlrelarr[0].lessRecently; _xlrelarr[0].lessRecently = res; res->lessRecently->moreRecently = res; Assert(&(res->reldata) != NULL); // Assert what it says in the interface -- we don't return NULL anymore. return &(res->reldata); }
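/*
 * A minimal, standalone model of the LRU bookkeeping that XLogOpenRelation
 * does on its cache entries: unlink the descriptor from wherever it sits in
 * the doubly-linked ring and relink it next to the sentinel as the
 * most-recently-used element.  The sentinel plays the role of _xlrelarr[0];
 * LruNode and lru_touch are invented names used only for this sketch.
 */
#include <stdio.h>

typedef struct LruNode
{
	struct LruNode *moreRecently;
	struct LruNode *lessRecently;
	int			id;
} LruNode;

/* Unlink "node" from the ring and relink it at the most-recently-used end. */
static void
lru_touch(LruNode *sentinel, LruNode *node)
{
	/* unlink (a harmless no-op for a freshly self-linked node) */
	node->lessRecently->moreRecently = node->moreRecently;
	node->moreRecently->lessRecently = node->lessRecently;

	/* reinsert adjacent to the sentinel, on its most-recently-used side */
	node->moreRecently = sentinel;
	node->lessRecently = sentinel->lessRecently;
	sentinel->lessRecently = node;
	node->lessRecently->moreRecently = node;
}

int
main(void)
{
	LruNode		sentinel = {&sentinel, &sentinel, -1};
	LruNode		a = {&a, &a, 1};
	LruNode		b = {&b, &b, 2};
	LruNode    *n;

	lru_touch(&sentinel, &a);	/* a is most recent */
	lru_touch(&sentinel, &b);	/* b is most recent */
	lru_touch(&sentinel, &a);	/* a is most recent again, b is now LRU */

	/* walk from least- to most-recently used: prints 2, then 1 */
	for (n = sentinel.moreRecently; n != &sentinel; n = n->moreRecently)
		printf("%d\n", n->id);
	return 0;
}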
/* * RelationTruncate * Physically truncate a relation to the specified number of blocks. * * This includes getting rid of any buffers for the blocks that are to be * dropped. */ void RelationTruncate(Relation rel, BlockNumber nblocks) { bool fsm; bool vm; /* Open it at the smgr level if not already done */ RelationOpenSmgr(rel); /* * Make sure smgr_targblock etc aren't pointing somewhere past new end */ rel->rd_smgr->smgr_targblock = InvalidBlockNumber; rel->rd_smgr->smgr_fsm_nblocks = InvalidBlockNumber; rel->rd_smgr->smgr_vm_nblocks = InvalidBlockNumber; /* Truncate the FSM first if it exists */ fsm = smgrexists(rel->rd_smgr, FSM_FORKNUM); if (fsm) FreeSpaceMapTruncateRel(rel, nblocks); /* Truncate the visibility map too if it exists. */ vm = smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM); if (vm) visibilitymap_truncate(rel, nblocks); /* * We WAL-log the truncation before actually truncating, which means * trouble if the truncation fails. If we then crash, the WAL replay * likely isn't going to succeed in the truncation either, causing a * PANIC. It's tempting to put a critical section here, but that cure * would be worse than the disease. It would turn a usually harmless * failure to truncate, that might spell trouble at WAL replay, into a * certain PANIC. */ if (!rel->rd_istemp) { /* * Make an XLOG entry reporting the file truncation. */ XLogRecPtr lsn; XLogRecData rdata; xl_smgr_truncate xlrec; xlrec.blkno = nblocks; xlrec.rnode = rel->rd_node; rdata.data = (char *) &xlrec; rdata.len = sizeof(xlrec); rdata.buffer = InvalidBuffer; rdata.next = NULL; lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE, &rdata); /* * Flush, because otherwise the truncation of the main relation might * hit the disk before the WAL record, and before the truncation of the * FSM or visibility map has reached disk. If we crashed during that * window, we'd be left with a truncated heap, but the FSM or visibility * map would still contain entries for the non-existent heap pages. */ if (fsm || vm) XLogFlush(lsn); } /* Do the real work */ smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks, rel->rd_istemp); }
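/*
 * A toy, self-contained illustration of the ordering rule discussed above:
 * make the intent durable (the WAL record) before performing the
 * irreversible file change (the truncation).  The file names, the record
 * format, and the whole scenario are invented; this is not backend code,
 * only the shape of "log, flush, then act" written with plain POSIX calls.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	const char *record = "TRUNCATE toy.data to 0 blocks\n";
	int			data_fd = open("toy.data", O_CREAT | O_WRONLY, 0600);
	int			wal_fd = open("toy.wal", O_CREAT | O_WRONLY | O_APPEND, 0600);

	if (data_fd < 0 || wal_fd < 0 ||
		write(data_fd, "old payload", 11) != 11)
	{
		perror("setup");
		return 1;
	}

	/* 1. Write the intent record ... */
	if (write(wal_fd, record, strlen(record)) != (ssize_t) strlen(record))
	{
		perror("write wal");
		return 1;
	}

	/* 2. ... force it to disk (the analogue of XLogFlush) ... */
	if (fsync(wal_fd) != 0)
	{
		perror("fsync wal");
		return 1;
	}

	/* 3. ... and only then shorten the data file (the analogue of smgrtruncate). */
	if (ftruncate(data_fd, 0) != 0)
	{
		perror("truncate data");
		return 1;
	}

	close(wal_fd);
	close(data_fd);
	printf("data file truncated only after the intent record was durable\n");
	return 0;
}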
/* * Read tuples in correct sort order from tuplesort, and load them into * btree leaves. */ static void _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) { BTPageState *state = NULL; bool merge = (btspool2 != NULL); IndexTuple itup, itup2 = NULL; bool should_free, should_free2, load1; TupleDesc tupdes = RelationGetDescr(wstate->index); int i, keysz = RelationGetNumberOfAttributes(wstate->index); ScanKey indexScanKey = NULL; if (merge) { /* * Another BTSpool for dead tuples exists. Now we have to merge * btspool and btspool2. */ /* the preparation of merge */ itup = tuplesort_getindextuple(btspool->sortstate, true, &should_free); itup2 = tuplesort_getindextuple(btspool2->sortstate, true, &should_free2); indexScanKey = _bt_mkscankey_nodata(wstate->index); for (;;) { load1 = true; /* load BTSpool next ? */ if (itup2 == NULL) { if (itup == NULL) break; } else if (itup != NULL) { for (i = 1; i <= keysz; i++) { ScanKey entry; Datum attrDatum1, attrDatum2; bool isNull1, isNull2; int32 compare; entry = indexScanKey + i - 1; attrDatum1 = index_getattr(itup, i, tupdes, &isNull1); attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2); if (isNull1) { if (isNull2) compare = 0; /* NULL "=" NULL */ else if (entry->sk_flags & SK_BT_NULLS_FIRST) compare = -1; /* NULL "<" NOT_NULL */ else compare = 1; /* NULL ">" NOT_NULL */ } else if (isNull2) { if (entry->sk_flags & SK_BT_NULLS_FIRST) compare = 1; /* NOT_NULL ">" NULL */ else compare = -1; /* NOT_NULL "<" NULL */ } else { compare = DatumGetInt32(FunctionCall2Coll(&entry->sk_func, entry->sk_collation, attrDatum1, attrDatum2)); if (entry->sk_flags & SK_BT_DESC) compare = -compare; } if (compare > 0) { load1 = false; break; } else if (compare < 0) break; } } else load1 = false; /* When we see first tuple, create first index page */ if (state == NULL) state = _bt_pagestate(wstate, 0); if (load1) { _bt_buildadd(wstate, state, itup); if (should_free) pfree(itup); itup = tuplesort_getindextuple(btspool->sortstate, true, &should_free); } else { _bt_buildadd(wstate, state, itup2); if (should_free2) pfree(itup2); itup2 = tuplesort_getindextuple(btspool2->sortstate, true, &should_free2); } } _bt_freeskey(indexScanKey); } else { /* merge is unnecessary */ while ((itup = tuplesort_getindextuple(btspool->sortstate, true, &should_free)) != NULL) { /* When we see first tuple, create first index page */ if (state == NULL) state = _bt_pagestate(wstate, 0); _bt_buildadd(wstate, state, itup); if (should_free) pfree(itup); } } /* Close down final pages and write the metapage */ _bt_uppershutdown(wstate, state); /* * If the index is WAL-logged, we must fsync it down to disk before it's * safe to commit the transaction. (For a non-WAL-logged index we don't * care since the index will be uninteresting after a crash anyway.) * * It's obvious that we must do this when not WAL-logging the build. It's * less obvious that we have to do it even if we did WAL-log the index * pages. The reason is that since we're building outside shared buffers, * a CHECKPOINT occurring during the build has no way to flush the * previously written data to disk (indeed it won't know the index even * exists). A crash later on would replay WAL from the checkpoint, * therefore it wouldn't replay our earlier WAL entries. If we do not * fsync those pages here, they might still not be on disk when the crash * occurs. */ if (RelationNeedsWAL(wstate->index)) { RelationOpenSmgr(wstate->index); smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM); } }
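/*
 * The NULL and DESC handling inside the merge loop above, distilled into a
 * standalone comparator.  A negative result means the first key sorts before
 * the second.  SortSpec and compare_keys are invented names; in the backend
 * the equivalent flags come from the scan key's sk_flags (SK_BT_DESC,
 * SK_BT_NULLS_FIRST) and the datum comparison goes through sk_func.
 */
#include <stdbool.h>
#include <stdio.h>

typedef struct SortSpec
{
	bool		descending;		/* SK_BT_DESC in the backend */
	bool		nulls_first;	/* SK_BT_NULLS_FIRST in the backend */
} SortSpec;

static int
compare_keys(int a, bool a_isnull, int b, bool b_isnull, SortSpec spec)
{
	int			cmp;

	if (a_isnull)
	{
		if (b_isnull)
			return 0;						/* NULL "=" NULL */
		return spec.nulls_first ? -1 : 1;	/* NULL vs NOT_NULL */
	}
	if (b_isnull)
		return spec.nulls_first ? 1 : -1;	/* NOT_NULL vs NULL */

	/* only the non-NULL comparison is inverted for descending order */
	cmp = (a > b) - (a < b);
	return spec.descending ? -cmp : cmp;
}

int
main(void)
{
	SortSpec	asc_nulls_last = {false, false};
	SortSpec	desc_nulls_first = {true, true};

	/* 3 sorts before 7 ascending, after it descending: prints -1 1 */
	printf("%d %d\n",
		   compare_keys(3, false, 7, false, asc_nulls_last),
		   compare_keys(3, false, 7, false, desc_nulls_first));

	/* NULL sorts after non-NULL unless nulls_first is set: prints 1 -1 */
	printf("%d %d\n",
		   compare_keys(0, true, 7, false, asc_nulls_last),
		   compare_keys(0, true, 7, false, desc_nulls_first));
	return 0;
}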
/* * pg_prewarm(regclass, mode text, fork text, * first_block int8, last_block int8) * * The first argument is the relation to be prewarmed; the second controls * how prewarming is done; legal options are 'prefetch', 'read', and 'buffer'. * The third is the name of the relation fork to be prewarmed. The fourth * and fifth arguments specify the first and last block to be prewarmed. * If the fourth argument is NULL, it will be taken as 0; if the fifth argument * is NULL, it will be taken as the number of blocks in the relation. The * return value is the number of blocks successfully prewarmed. */ Datum pg_prewarm(PG_FUNCTION_ARGS) { Oid relOid; text *forkName; text *type; int64 first_block; int64 last_block; int64 nblocks; int64 blocks_done = 0; int64 block; Relation rel; ForkNumber forkNumber; char *forkString; char *ttype; PrewarmType ptype; AclResult aclresult; /* Basic sanity checking. */ if (PG_ARGISNULL(0)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("relation cannot be null"))); relOid = PG_GETARG_OID(0); if (PG_ARGISNULL(1)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), (errmsg("prewarm type cannot be null")))); type = PG_GETARG_TEXT_P(1); ttype = text_to_cstring(type); if (strcmp(ttype, "prefetch") == 0) ptype = PREWARM_PREFETCH; else if (strcmp(ttype, "read") == 0) ptype = PREWARM_READ; else if (strcmp(ttype, "buffer") == 0) ptype = PREWARM_BUFFER; else { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid prewarm type"), errhint("Valid prewarm types are \"prefetch\", \"read\", and \"buffer\"."))); PG_RETURN_INT64(0); /* Placate compiler. */ } if (PG_ARGISNULL(2)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), (errmsg("relation fork cannot be null")))); forkName = PG_GETARG_TEXT_P(2); forkString = text_to_cstring(forkName); forkNumber = forkname_to_number(forkString); /* Open relation and check privileges. */ rel = relation_open(relOid, AccessShareLock); aclresult = pg_class_aclcheck(relOid, GetUserId(), ACL_SELECT); if (aclresult != ACLCHECK_OK) aclcheck_error(aclresult, ACL_KIND_CLASS, get_rel_name(relOid)); /* Check that the fork exists. */ RelationOpenSmgr(rel); if (!smgrexists(rel->rd_smgr, forkNumber)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("fork \"%s\" does not exist for this relation", forkString))); /* Validate block numbers, or handle nulls. */ nblocks = RelationGetNumberOfBlocksInFork(rel, forkNumber); if (PG_ARGISNULL(3)) first_block = 0; else { first_block = PG_GETARG_INT64(3); if (first_block < 0 || first_block >= nblocks) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("starting block number must be between 0 and " INT64_FORMAT, nblocks - 1))); } if (PG_ARGISNULL(4)) last_block = nblocks - 1; else { last_block = PG_GETARG_INT64(4); if (last_block < 0 || last_block >= nblocks) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("ending block number must be between 0 and " INT64_FORMAT, nblocks - 1))); } /* Now we're ready to do the real work. */ if (ptype == PREWARM_PREFETCH) { #ifdef USE_PREFETCH /* * In prefetch mode, we just hint the OS to read the blocks, but we * don't know whether it really does it, and we don't wait for it to * finish. * * It would probably be better to pass our prefetch requests in chunks * of a megabyte or maybe even a whole segment at a time, but there's * no practical way to do that at present without a gross modularity * violation, so we just do this. 
*/ for (block = first_block; block <= last_block; ++block) { CHECK_FOR_INTERRUPTS(); PrefetchBuffer(rel, forkNumber, block); ++blocks_done; } #else ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("prefetch is not supported by this build"))); #endif } else if (ptype == PREWARM_READ) { /* * In read mode, we actually read the blocks, but not into shared * buffers. This is more portable than prefetch mode (it works * everywhere) and is synchronous. */ for (block = first_block; block <= last_block; ++block) { CHECK_FOR_INTERRUPTS(); smgrread(rel->rd_smgr, forkNumber, block, blockbuffer); ++blocks_done; } } else if (ptype == PREWARM_BUFFER) { /* * In buffer mode, we actually pull the data into shared_buffers. */ for (block = first_block; block <= last_block; ++block) { Buffer buf; CHECK_FOR_INTERRUPTS(); buf = ReadBufferExtended(rel, forkNumber, block, RBM_NORMAL, NULL); ReleaseBuffer(buf); ++blocks_done; } } /* Close relation, release lock. */ relation_close(rel, AccessShareLock); PG_RETURN_INT64(blocks_done); }
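/*
 * A hedged sketch of driving pg_prewarm from another backend extension via
 * SPI, for example to warm a table right after a bulk load.  warm_relation()
 * and the hard-coded table name are invented for illustration; from SQL the
 * equivalent is simply:
 *     SELECT pg_prewarm('my_table'::regclass, 'buffer', 'main', NULL, NULL);
 */
#include "postgres.h"

#include "executor/spi.h"
#include "fmgr.h"

PG_MODULE_MAGIC;

PG_FUNCTION_INFO_V1(warm_relation);

Datum
warm_relation(PG_FUNCTION_ARGS)
{
	/* Pull the whole main fork of the (hypothetical) table into shared_buffers. */
	const char *sql =
	"SELECT pg_prewarm('my_table'::regclass, 'buffer', 'main', NULL, NULL)";

	if (SPI_connect() != SPI_OK_CONNECT)
		elog(ERROR, "SPI_connect failed");

	if (SPI_execute(sql, false, 0) != SPI_OK_SELECT)
		elog(ERROR, "pg_prewarm call failed");

	SPI_finish();

	PG_RETURN_VOID();
}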
Relation DirectOpen_Open( DirectOpen *direct, Oid relationId, Oid tablespace, Oid database, Oid relfilenode, FormData_pg_class *pgClass, FormData_pg_attribute *attrArray, FormData_pg_am *pgAm, FormData_pg_index *pgIndex, int2 *indKeyArray, Oid *indClassArray, bool relHasOid) { int natts; int i; Assert(pgClass != NULL); natts = pgClass->relnatts; if (relationId == -1) relationId = pgClass->relfilenode; // Assume it is ok to use the relfilenode as the relationId in our limited usage. if (relfilenode == -1) relfilenode = pgClass->relfilenode; if (!direct->isInit) { /* * Lots of hard-coded construction of the gp_persistent* relations and * dependent objects like tuple descriptors, etc. */ direct->relationData.rd_refcnt = 0; direct->relationData.rd_isvalid = true; direct->relationData.rd_id = relationId; direct->relationData.rd_rel = pgClass; if (pgIndex != NULL) { int pgIndexFixedLen = offsetof(FormData_pg_index, indkey); int indKeyVectorLen = Int2VectorSize(natts); int2vector *indKeyVector; oidvector *indClassVector; uint16 amstrategies; uint16 amsupport; Oid *operator; RegProcedure *support; FmgrInfo *supportinfo; Assert(pgAm != NULL); Assert(indKeyArray != NULL); Assert(indClassArray != NULL); /* * Allocate FormData_pg_index with fields through indkey, * where indkey is a variable-length int2vector with indKeyArray values. */ direct->relationData.rd_index = (FormData_pg_index*)palloc( pgIndexFixedLen + indKeyVectorLen); memcpy(direct->relationData.rd_index, pgIndex, pgIndexFixedLen); indKeyVector = buildint2vector( indKeyArray, natts); memcpy( &direct->relationData.rd_index->indkey, indKeyVector, indKeyVectorLen); pfree(indKeyVector); direct->relationData.rd_am = pgAm; amstrategies = pgAm->amstrategies; amsupport = pgAm->amsupport; direct->relationData.rd_indexcxt = TopMemoryContext; /* * Allocate arrays to hold data */ direct->relationData.rd_aminfo = (RelationAmInfo *) MemoryContextAllocZero(TopMemoryContext, sizeof(RelationAmInfo)); direct->relationData.rd_opfamily = (Oid *) MemoryContextAllocZero(TopMemoryContext, natts * sizeof(Oid)); direct->relationData.rd_opcintype = (Oid *) MemoryContextAllocZero(TopMemoryContext, natts * sizeof(Oid)); if (amstrategies > 0) operator = (Oid *) MemoryContextAllocZero(TopMemoryContext, natts * amstrategies * sizeof(Oid)); else operator = NULL; if (amsupport > 0) { int nsupport = natts * amsupport; support = (RegProcedure *) MemoryContextAllocZero(TopMemoryContext, nsupport * sizeof(RegProcedure)); supportinfo = (FmgrInfo *) MemoryContextAllocZero(TopMemoryContext, nsupport * sizeof(FmgrInfo)); } else { support = NULL; supportinfo = NULL; } direct->relationData.rd_operator = operator; direct->relationData.rd_support = support; direct->relationData.rd_supportinfo = supportinfo; direct->relationData.rd_indoption = (int16 *) MemoryContextAllocZero(TopMemoryContext, natts * sizeof(int16)); /* * Create oidvector in rd_indclass with values from indClassArray. */ indClassVector = buildoidvector(indClassArray, natts); /* * Fill the operator and support procedure OID arrays. (aminfo and * supportinfo are left as zeroes, and are filled on-the-fly when used) */ IndexSupportInitialize(indClassVector, operator, support, direct->relationData.rd_opfamily, direct->relationData.rd_opcintype, amstrategies, amsupport, natts); /* * expressions and predicate cache will be filled later. */ direct->relationData.rd_indexprs = NIL; direct->relationData.rd_indpred = NIL; direct->relationData.rd_amcache = NULL; } // Not much in terms of constraints. 
direct->constrData.has_not_null = true; /* * Setup tuple descriptor for columns. */ direct->descData.natts = pgClass->relnatts; // Make the array of pointers. direct->descData.attrs = (Form_pg_attribute*) MemoryContextAllocZero( TopMemoryContext, sizeof(Form_pg_attribute*) * pgClass->relnatts); for (i = 0; i < pgClass->relnatts; i++) { direct->descData.attrs[i] = (Form_pg_attribute) MemoryContextAllocZero( TopMemoryContext, sizeof(FormData_pg_attribute)); memcpy(direct->descData.attrs[i], &(attrArray[i]), sizeof(FormData_pg_attribute)); // Patch up relation id. direct->descData.attrs[i]->attrelid = relationId; } direct->descData.constr = &direct->constrData; direct->descData.tdtypeid = pgClass->reltype; direct->descData.tdtypmod = -1; direct->descData.tdqdtypmod = -1; direct->descData.tdhasoid = relHasOid; direct->descData.tdrefcount = 1; direct->relationData.rd_att = &direct->descData; direct->pgStat.t_id = relationId; direct->pgStat.t_shared = 1; direct->relationData.pgstat_info = &direct->pgStat; direct->isInit = true; } // UNDONE: Should verify for NON-SHARED relations we don't open relations in different databases / or // UNDONE: open different relations in same database at same time !!! direct->relationData.rd_node.spcNode = tablespace; direct->relationData.rd_node.dbNode = database; direct->relationData.rd_node.relNode = relfilenode; direct->relationData.rd_targblock = InvalidBlockNumber; for (i = 0; i < direct->relationData.rd_rel->relnatts; i++) { Assert(direct->descData.attrs[i] != NULL); // Patch up relation id. direct->descData.attrs[i]->attrelid = direct->relationData.rd_id; } direct->relationData.rd_refcnt++; RelationOpenSmgr(&direct->relationData); return &direct->relationData; }
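/*
 * A standalone model of the allocation pattern used above for rd_index:
 * allocate the fixed-size front of a catalog struct plus room for a
 * variable-length vector, copy the fixed part, then copy the vector into the
 * trailing member.  IndexHeader and build_index_header are invented
 * stand-ins for FormData_pg_index and the buildint2vector/memcpy sequence;
 * the real int2vector also carries a varlena header, which is omitted here.
 */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct IndexHeader
{
	unsigned int indexrelid;	/* fixed-size fields come first ... */
	int			nkeys;
	short		keys[];			/* ... then a flexible array, like indkey */
} IndexHeader;

static IndexHeader *
build_index_header(unsigned int indexrelid, const short *keyArray, int nkeys)
{
	size_t		fixedLen = offsetof(IndexHeader, keys);
	size_t		tailLen = (size_t) nkeys * sizeof(short);
	IndexHeader *result = malloc(fixedLen + tailLen);

	if (result == NULL)
	{
		perror("malloc");
		exit(1);
	}
	result->indexrelid = indexrelid;
	result->nkeys = nkeys;
	memcpy(result->keys, keyArray, tailLen);	/* like copying the key vector */
	return result;
}

int
main(void)
{
	short		keys[] = {1, 3, 2};
	IndexHeader *h = build_index_header(42, keys, 3);
	int			i;

	for (i = 0; i < h->nkeys; i++)
		printf("key %d: attnum %d\n", i, h->keys[i]);
	free(h);
	return 0;
}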