Example #1
/*
 * SetMatViewToPopulated
 *		Indicate that the materialized view has been populated by its query.
 *
 * NOTE: The heap starts out in a state that doesn't look scannable, and can
 * only transition from there to scannable at the time a new heap is created.
 *
 * NOTE: caller must be holding an appropriate lock on the relation.
 */
void
SetMatViewToPopulated(Relation relation)
{
	Page        page;

	Assert(relation->rd_rel->relkind == RELKIND_MATVIEW);
	Assert(relation->rd_ispopulated == false);

	page = (Page) palloc(BLCKSZ);
	PageInit(page, BLCKSZ, 0);

	if (RelationNeedsWAL(relation))
		log_newpage(&(relation->rd_node), MAIN_FORKNUM, 0, page);

	RelationOpenSmgr(relation);

	PageSetChecksumInplace(page, 0);
	smgrextend(relation->rd_smgr, MAIN_FORKNUM, 0, (char *) page, true);

	pfree(page);

	smgrimmedsync(relation->rd_smgr, MAIN_FORKNUM);

	RelationCacheInvalidateEntry(relation->rd_id);
}
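A minimal sketch of how a caller might drive this function after filling a materialized view. The caller name and the surrounding refresh machinery are assumptions, not taken from the example; only the lock requirement comes from the NOTE above.

/* Hypothetical caller (sketch): mark a matview populated after loading it. */
static void
refresh_matview_sketch(Oid matviewOid)
{
	/* SetMatViewToPopulated requires an appropriate lock to be held. */
	Relation	matview = heap_open(matviewOid, AccessExclusiveLock);

	/* ... run the view's query and load the heap here ... */

	SetMatViewToPopulated(matview);		/* WAL-logs and fsyncs page 0 */

	heap_close(matview, NoLock);		/* hold the lock until commit */
}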
Example #2
/*
 * Remove the visibility map fork for a relation.  If there turn out to be
 * any bugs in the visibility map code that require rebuilding the VM, this
 * provides users with a way to do it that is cleaner than shutting down the
 * server and removing files by hand.
 *
 * This is a cut-down version of RelationTruncate.
 */
Datum
pg_truncate_visibility_map(PG_FUNCTION_ARGS)
{
    Oid			relid = PG_GETARG_OID(0);
    Relation	rel;

    rel = relation_open(relid, AccessExclusiveLock);

    if (rel->rd_rel->relkind != RELKIND_RELATION &&
            rel->rd_rel->relkind != RELKIND_MATVIEW &&
            rel->rd_rel->relkind != RELKIND_TOASTVALUE)
        ereport(ERROR,
                (errcode(ERRCODE_WRONG_OBJECT_TYPE),
                 errmsg("\"%s\" is not a table, materialized view, or TOAST table",
                        RelationGetRelationName(rel))));

    RelationOpenSmgr(rel);
    rel->rd_smgr->smgr_vm_nblocks = InvalidBlockNumber;

    visibilitymap_truncate(rel, 0);

    if (RelationNeedsWAL(rel))
    {
        xl_smgr_truncate xlrec;

        xlrec.blkno = 0;
        xlrec.rnode = rel->rd_node;
        xlrec.flags = SMGR_TRUNCATE_VM;

        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, sizeof(xlrec));

        XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
    }

    /*
     * Release the lock right away, not at commit time.
     *
     * It would be a problem to release the lock prior to commit if this
     * truncate operation sends any transactional invalidation messages. Other
     * backends would potentially be able to lock the relation without
     * processing them in the window of time between when we release the lock
     * here and when we send the messages at our eventual commit.  However,
     * we're currently only sending a non-transactional smgr invalidation,
     * which will have been posted to shared memory immediately from within
     * visibilitymap_truncate.  Therefore, there should be no race here.
     *
     * The reason why it's desirable to release the lock early here is because
     * of the possibility that someone will need to use this to blow away many
     * visibility map forks at once.  If we can't release the lock until
     * commit time, the transaction doing this will accumulate
     * AccessExclusiveLocks on all of those relations at the same time, which
     * is undesirable. However, if this turns out to be unsafe we may have no
     * choice...
     */
    relation_close(rel, AccessExclusiveLock);

    /* Nothing to return. */
    PG_RETURN_VOID();
}
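Since this is a SQL-callable function, it also needs the usual version-1 calling-convention boilerplate; a minimal sketch (the SQL declaration shown in the comment is an assumption, not part of the example):

PG_FUNCTION_INFO_V1(pg_truncate_visibility_map);

/*
 * Assumed matching SQL declaration (if built as a loadable module; a
 * built-in version would use a pg_proc entry instead):
 *
 *   CREATE FUNCTION pg_truncate_visibility_map(regclass) RETURNS void
 *       AS 'MODULE_PATHNAME' LANGUAGE C STRICT;
 */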
Example #3
/*
 * Ensure that the visibility map fork is at least vm_nblocks long, extending
 * it if necessary with zeroed pages.
 */
static void
vm_extend(Relation rel, BlockNumber vm_nblocks)
{
	BlockNumber vm_nblocks_now;
	Page		pg;

	pg = (Page) palloc(BLCKSZ);
	PageInit(pg, BLCKSZ, 0);

	/*
	 * We use the relation extension lock to lock out other backends trying to
	 * extend the visibility map at the same time. It also locks out extension
	 * of the main fork, unnecessarily, but extending the visibility map
	 * happens seldom enough that it doesn't seem worthwhile to have a
	 * separate lock tag type for it.
	 *
	 * Note that another backend might have extended or created the relation
	 * by the time we get the lock.
	 */
	LockRelationForExtension(rel, ExclusiveLock);

	/* Might have to re-open if a cache flush happened */
	RelationOpenSmgr(rel);

	/*
	 * Create the file first if it doesn't exist.  If smgr_vm_nblocks is
	 * positive then it must exist, no need for an smgrexists call.
	 */
	if ((rel->rd_smgr->smgr_vm_nblocks == 0 ||
		 rel->rd_smgr->smgr_vm_nblocks == InvalidBlockNumber) &&
		!smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
		smgrcreate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, false);

	vm_nblocks_now = smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM);

	/* Now extend the file */
	while (vm_nblocks_now < vm_nblocks)
	{
		smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now,
				   (char *) pg, false);
		vm_nblocks_now++;
	}

	/*
	 * Send a shared-inval message to force other backends to close any smgr
	 * references they may have for this rel, which we are about to change.
	 * This is a useful optimization because it means that backends don't have
	 * to keep checking for creation or extension of the file, which happens
	 * infrequently.
	 */
	CacheInvalidateSmgr(rel->rd_smgr->smgr_rnode);

	/* Update local cache with the up-to-date size */
	rel->rd_smgr->smgr_vm_nblocks = vm_nblocks_now;

	UnlockRelationForExtension(rel, ExclusiveLock);

	pfree(pg);
}
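A sketch of the calling pattern: readers map a heap block to its map page (one visibility bit per heap block) and ask vm_extend to cover it. HEAPBLK_TO_MAPBLOCK is the conventional macro for that mapping (see the macro sketch under Example #17 below); the caller name here is hypothetical.

/* Sketch: make sure the VM fork covers heap block heapBlk. */
static void
vm_cover_heap_block(Relation rel, BlockNumber heapBlk)
{
	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);

	/*
	 * Safe to call even if the fork is already long enough: vm_extend only
	 * appends pages past the current EOF.
	 */
	vm_extend(rel, mapBlock + 1);
}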
Example #4
/*
 * FreeSpaceMapTruncateRel - adjust for truncation of a relation.
 *
 * The caller must hold AccessExclusiveLock on the relation, to ensure that
 * other backends receive the smgr invalidation event that this function sends
 * before they access the FSM again.
 *
 * nblocks is the new size of the heap.
 */
void
FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks)
{
	BlockNumber new_nfsmblocks;
	FSMAddress	first_removed_address;
	uint16		first_removed_slot;
	Buffer		buf;

	RelationOpenSmgr(rel);

	/*
	 * If no FSM has been created yet for this relation, there's nothing to
	 * truncate.
	 */
	if (!smgrexists(rel->rd_smgr, FSM_FORKNUM))
		return;

	/* Get the location in the FSM of the first removed heap block */
	first_removed_address = fsm_get_location(nblocks, &first_removed_slot);

	/*
	 * Zero out the tail of the last remaining FSM page. If the slot
	 * representing the first removed heap block is at a page boundary, as the
	 * first slot on the FSM page that first_removed_address points to, we can
	 * just truncate that page altogether.
	 */
	if (first_removed_slot > 0)
	{
		buf = fsm_readbuf(rel, first_removed_address, false);
		if (!BufferIsValid(buf))
			return;				/* nothing to do; the FSM was already smaller */
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
		MarkBufferDirtyHint(buf, false);
		UnlockReleaseBuffer(buf);

		new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
	}
	else
	{
		new_nfsmblocks = fsm_logical_to_physical(first_removed_address);
		if (smgrnblocks(rel->rd_smgr, FSM_FORKNUM) <= new_nfsmblocks)
			return;				/* nothing to do; the FSM was already smaller */
	}

	/* Truncate the unused FSM pages, and send smgr inval message */
	smgrtruncate(rel->rd_smgr, FSM_FORKNUM, new_nfsmblocks);

	/*
	 * We might as well update the local smgr_fsm_nblocks setting.
	 * smgrtruncate sent an smgr cache inval message, which will cause other
	 * backends to invalidate their copy of smgr_fsm_nblocks, and this one too
	 * at the next command boundary.  But this ensures it isn't outright wrong
	 * until then.
	 */
	if (rel->rd_smgr)
		rel->rd_smgr->smgr_fsm_nblocks = new_nfsmblocks;
}
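For context, a sketch of the call site: relation truncation trims the auxiliary forks and then cuts the main fork, roughly as PostgreSQL's RelationTruncate does. This is an assumption-laden outline, not the real function body.

/* Sketch of a RelationTruncate-style caller (simplified). */
static void
truncate_relation_sketch(Relation rel, BlockNumber nblocks)
{
	/* Caller holds AccessExclusiveLock, per the contract above. */
	RelationOpenSmgr(rel);

	/* Trim the auxiliary forks first, then the main fork. */
	FreeSpaceMapTruncateRel(rel, nblocks);
	visibilitymap_truncate(rel, nblocks);

	smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks);
}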
Example #5
/*
 * emit a completed btree page, and release the working storage.
 */
static void
_bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
{
	/* Ensure rd_smgr is open (could have been closed by relcache flush!) */
	RelationOpenSmgr(wstate->index);

	/* XLOG stuff */
	if (wstate->btws_use_wal)
	{
		/* We use the heap NEWPAGE record type for this */
		log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page);
	}
	else
	{
		/* Leave the page LSN zero if not WAL-logged, but set TLI anyway */
		PageSetTLI(page, ThisTimeLineID);
	}

	/*
	 * If we have to write pages nonsequentially, fill in the space with
	 * zeroes until we come back and overwrite.  This is not logically
	 * necessary on standard Unix filesystems (unwritten space will read as
	 * zeroes anyway), but it should help to avoid fragmentation. The dummy
	 * pages aren't WAL-logged though.
	 */
	while (blkno > wstate->btws_pages_written)
	{
		if (!wstate->btws_zeropage)
			wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
		smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM,
				   wstate->btws_pages_written++,
				   (char *) wstate->btws_zeropage,
				   true);
	}

	/*
	 * Now write the page.	We say isTemp = true even if it's not a temp
	 * index, because there's no need for smgr to schedule an fsync for this
	 * write; we'll do it ourselves before ending the build.
	 */
	if (blkno == wstate->btws_pages_written)
	{
		/* extending the file... */
		smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
				   (char *) page, true);
		wstate->btws_pages_written++;
	}
	else
	{
		/* overwriting a block we zero-filled before */
		smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
				  (char *) page, true);
	}

	pfree(page);
}
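The fields of BTWriteState that this function relies on, reconstructed from the usage above (a sketch, not the full struct from nbtsort.c):

/* Subset of BTWriteState as used by _bt_blwritepage (reconstruction). */
typedef struct BTWriteStateSketch
{
	Relation	index;				/* index relation being built */
	bool		btws_use_wal;		/* dump pages to WAL? */
	BlockNumber btws_pages_written; /* # of pages already written to smgr */
	Page		btws_zeropage;		/* lazily allocated all-zero filler page */
} BTWriteStateSketch;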
Example #6
/*
 * End a rewrite.
 *
 * state and any other resources are freed.
 */
void
end_heap_rewrite(RewriteState state)
{
	HASH_SEQ_STATUS seq_status;
	UnresolvedTup unresolved;

	/*
	 * Write any remaining tuples in the UnresolvedTups table. If we have any
	 * left, they should in fact be dead, but let's err on the safe side.
	 */
	hash_seq_init(&seq_status, state->rs_unresolved_tups);

	while ((unresolved = hash_seq_search(&seq_status)) != NULL)
	{
		ItemPointerSetInvalid(&unresolved->tuple->t_data->t_ctid);
		raw_heap_insert(state, unresolved->tuple);
	}

	/* Write the last page, if any */
	if (state->rs_buffer_valid)
	{
		if (state->rs_use_wal)
			log_newpage(&state->rs_new_rel->rd_node,
						MAIN_FORKNUM,
						state->rs_blockno,
						state->rs_buffer,
						true);
		RelationOpenSmgr(state->rs_new_rel);

		PageSetChecksumInplace(state->rs_buffer, state->rs_blockno);

		smgrextend(state->rs_new_rel->rd_smgr, MAIN_FORKNUM, state->rs_blockno,
				   (char *) state->rs_buffer, true);
	}

	/*
	 * If the rel is WAL-logged, must fsync before commit.	We use heap_sync
	 * to ensure that the toast table gets fsync'd too.
	 *
	 * It's obvious that we must do this when not WAL-logging. It's less
	 * obvious that we have to do it even if we did WAL-log the pages. The
	 * reason is the same as in tablecmds.c's copy_relation_data(): we're
	 * writing data that's not in shared buffers, and so a CHECKPOINT
	 * occurring during the rewriteheap operation won't have fsync'd data we
	 * wrote before the checkpoint.
	 */
	if (RelationNeedsWAL(state->rs_new_rel))
		heap_sync(state->rs_new_rel);

	/* Deleting the context frees everything */
	MemoryContextDelete(state->rs_cxt);
}
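The rewrite lifecycle this function closes out, sketched with the public rewriteheap.h entry points. Argument lists vary across PostgreSQL versions, so treat the exact signatures as assumptions.

/* Sketch of a full heap rewrite (e.g. as CLUSTER would drive it). */
static void
rewrite_heap_sketch(Relation new_heap, TransactionId oldest_xmin,
					TransactionId freeze_xid, MultiXactId cutoff_multi,
					bool use_wal)
{
	RewriteState state;

	state = begin_heap_rewrite(new_heap, oldest_xmin, freeze_xid,
							   cutoff_multi, use_wal);

	/* ... for each live tuple read from the old heap ... */
	/*     rewrite_heap_tuple(state, old_tuple, new_tuple); */

	end_heap_rewrite(state);	/* writes the last page and fsyncs if needed */
}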
Example #7
/*
 * Ensure that the FSM fork is at least fsm_nblocks long, extending
 * it if necessary with empty pages. And by empty, I mean pages filled
 * with zeros, meaning there's no free space.
 */
static void
fsm_extend(Relation rel, BlockNumber fsm_nblocks)
{
	BlockNumber fsm_nblocks_now;
	Page		pg;

	pg = (Page) palloc(BLCKSZ);
	PageInit(pg, BLCKSZ, 0);

	/*
	 * We use the relation extension lock to lock out other backends trying to
	 * extend the FSM at the same time. It also locks out extension of the
	 * main fork, unnecessarily, but extending the FSM happens seldom enough
	 * that it doesn't seem worthwhile to have a separate lock tag type for
	 * it.
	 *
	 * Note that another backend might have extended or created the relation
	 * by the time we get the lock.
	 */
	LockRelationForExtension(rel, ExclusiveLock);

	/* Might have to re-open if a cache flush happened */
	RelationOpenSmgr(rel);

	/*
	 * Create the FSM file first if it doesn't exist.  If smgr_fsm_nblocks is
	 * positive then it must exist, no need for an smgrexists call.
	 */
	if ((rel->rd_smgr->smgr_fsm_nblocks == 0 ||
		 rel->rd_smgr->smgr_fsm_nblocks == InvalidBlockNumber) &&
		!smgrexists(rel->rd_smgr, FSM_FORKNUM))
		smgrcreate(rel->rd_smgr, FSM_FORKNUM, false);

	fsm_nblocks_now = smgrnblocks(rel->rd_smgr, FSM_FORKNUM);

	while (fsm_nblocks_now < fsm_nblocks)
	{
		PageSetChecksumInplace(pg, fsm_nblocks_now);

		smgrextend(rel->rd_smgr, FSM_FORKNUM, fsm_nblocks_now,
				   (char *) pg, false);
		fsm_nblocks_now++;
	}

	/* Update local cache with the up-to-date size */
	rel->rd_smgr->smgr_fsm_nblocks = fsm_nblocks_now;

	UnlockRelationForExtension(rel, ExclusiveLock);

	pfree(pg);
}
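For orientation, the logical addresses that callers translate into the physical block numbers passed to fsm_extend look like this; it mirrors the internal FSMAddress of freespace.c, but treat the exact layout as an assumption.

/* Logical address of an FSM page (reconstructed sketch). */
typedef struct
{
	int			level;			/* level in the FSM tree; 0 is the bottom */
	int			logpageno;		/* page number within that level */
} FSMAddressSketch;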
Example #8
/*
 * For heaps, we prevent creation of the FSM unless the number of pages
 * exceeds HEAP_FSM_CREATION_THRESHOLD.  For tables that don't already have
 * a FSM, this will save an inode and a few kB of space.
 *
 * XXX The API is a little awkward -- if the caller passes a valid nblocks
 * value, it can avoid invoking a system call.  If the caller passes
 * InvalidBlockNumber and receives a false return value, it can get an
 * up-to-date relation size from get_nblocks.  This saves a few cycles in
 * the caller, which would otherwise need to get the relation size by itself.
 */
static bool
fsm_allow_writes(Relation rel, BlockNumber heapblk,
				 BlockNumber nblocks, BlockNumber *get_nblocks)
{
	bool		skip_get_nblocks;

	if (heapblk >= HEAP_FSM_CREATION_THRESHOLD)
		return true;

	/* Non-heap rels can always create a FSM. */
	if (rel->rd_rel->relkind != RELKIND_RELATION &&
		rel->rd_rel->relkind != RELKIND_TOASTVALUE)
		return true;

	/*
	 * If the caller knows nblocks, we can avoid a system call later. If it
	 * doesn't, maybe we have relpages from a previous VACUUM. Since the table
	 * may have extended since then, we still have to count the pages later if
	 * we can't return now.
	 */
	if (nblocks != InvalidBlockNumber)
	{
		if (nblocks > HEAP_FSM_CREATION_THRESHOLD)
			return true;
		else
			skip_get_nblocks = true;
	}
	else
	{
		if (rel->rd_rel->relpages != InvalidBlockNumber &&
			rel->rd_rel->relpages > HEAP_FSM_CREATION_THRESHOLD)
			return true;
		else
			skip_get_nblocks = false;
	}

	RelationOpenSmgr(rel);
	if (smgrexists(rel->rd_smgr, FSM_FORKNUM))
		return true;

	if (skip_get_nblocks)
		return false;

	/* last resort */
	*get_nblocks = RelationGetNumberOfBlocks(rel);
	if (*get_nblocks > HEAP_FSM_CREATION_THRESHOLD)
		return true;
	else
		return false;
}
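A sketch of how a caller might use the admittedly awkward API described in the comment, reusing the returned block count to avoid a second system call. The caller name and the fallback strategy are hypothetical.

/* Hypothetical caller: choose between the FSM fork and a local map. */
static void
record_free_space_sketch(Relation rel, BlockNumber heapBlk)
{
	BlockNumber nblocks = InvalidBlockNumber;	/* size unknown up front */

	if (fsm_allow_writes(rel, heapBlk, InvalidBlockNumber, &nblocks))
	{
		/* FSM exists, or the table is large enough: update it normally. */
	}
	else
	{
		/*
		 * Small table with no FSM fork.  If the "last resort" path ran,
		 * nblocks now holds an up-to-date size we can reuse without
		 * calling RelationGetNumberOfBlocks again.
		 */
	}
}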
Example #9
/*
 * Read a visibility map page.
 *
 * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
 * true, the visibility map file is extended.
 */
static Buffer
vm_readbuf(Relation rel, BlockNumber blkno, bool extend)
{
	Buffer		buf;

	/*
	 * We might not have opened the relation at the smgr level yet, or we
	 * might have been forced to close it by a sinval message.  The code below
	 * won't necessarily notice relation extension immediately when extend =
	 * false, so we rely on sinval messages to ensure that our ideas about the
	 * size of the map aren't too far out of date.
	 */
	RelationOpenSmgr(rel);

	/*
	 * If we haven't cached the size of the visibility map fork yet, check it
	 * first.
	 */
	if (rel->rd_smgr->smgr_vm_nblocks == InvalidBlockNumber)
	{
		if (smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
			rel->rd_smgr->smgr_vm_nblocks = smgrnblocks(rel->rd_smgr,
													  VISIBILITYMAP_FORKNUM);
		else
			rel->rd_smgr->smgr_vm_nblocks = 0;
	}

	/* Handle requests beyond EOF */
	if (blkno >= rel->rd_smgr->smgr_vm_nblocks)
	{
		if (extend)
			vm_extend(rel, blkno + 1);
		else
			return InvalidBuffer;
	}

	/*
	 * Use ZERO_ON_ERROR mode, and initialize the page if necessary. It's
	 * always safe to clear bits, so it's better to clear corrupt pages than
	 * error out.
	 */
	buf = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, blkno,
							 RBM_ZERO_ON_ERROR, NULL);
	if (PageIsNew(BufferGetPage(buf)))
		PageInit(BufferGetPage(buf), BLCKSZ, 0);
	return buf;
}
Example #10
/*
 * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages
 *
 * This does not need to initialize the new bucket pages; we'll do that as
 * each one is used by _hash_expandtable().  But we have to extend the logical
 * EOF to the end of the splitpoint; this keeps smgr's idea of the EOF in
 * sync with ours, so that we don't get complaints from smgr.
 *
 * We do this by writing a page of zeroes at the end of the splitpoint range.
 * We expect that the filesystem will ensure that the intervening pages read
 * as zeroes too.  On many filesystems this "hole" will not be allocated
 * immediately, which means that the index file may end up more fragmented
 * than if we forced it all to be allocated now; but since we don't scan
 * hash indexes sequentially anyway, that probably doesn't matter.
 *
 * XXX It's annoying that this code is executed with the metapage lock held.
 * We need to interlock against _hash_addovflpage() adding a new overflow page
 * concurrently, but it'd likely be better to use LockRelationForExtension
 * for the purpose.  OTOH, adding a splitpoint is a very infrequent operation,
 * so it may not be worth worrying about.
 *
 * Returns TRUE if successful, or FALSE if allocation failed due to
 * BlockNumber overflow.
 */
static bool
_hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
{
	BlockNumber lastblock;
	char		zerobuf[BLCKSZ];
	Page		page;
	HashPageOpaque ovflopaque;

	lastblock = firstblock + nblocks - 1;

	/*
	 * Check for overflow in block number calculation; if so, we cannot extend
	 * the index anymore.
	 */
	if (lastblock < firstblock || lastblock == InvalidBlockNumber)
		return false;

	page = (Page) zerobuf;

	/*
	 * Initialize the page.  Just zeroing the page won't work; see
	 * _hash_freeovflpage for similar usage.  We take care to make the special
	 * space valid for the benefit of tools such as pageinspect.
	 */
	_hash_pageinit(page, BLCKSZ);

	ovflopaque = (HashPageOpaque) PageGetSpecialPointer(page);

	ovflopaque->hasho_prevblkno = InvalidBlockNumber;
	ovflopaque->hasho_nextblkno = InvalidBlockNumber;
	ovflopaque->hasho_bucket = -1;
	ovflopaque->hasho_flag = LH_UNUSED_PAGE;
	ovflopaque->hasho_page_id = HASHO_PAGE_ID;

	if (RelationNeedsWAL(rel))
		log_newpage(&rel->rd_node,
					MAIN_FORKNUM,
					lastblock,
					zerobuf,
					true);

	RelationOpenSmgr(rel);
	smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf, false);

	return true;
}
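A sketch of the expected call site: when the hash AM starts a new splitpoint it allocates all of that splitpoint's buckets at once. The block-number arithmetic below ignores the metapage and bitmap-page offsets that the real caller (_hash_expandtable) accounts for, so treat it as illustrative only.

/* Sketch: reserve buckets 2^n .. 2^(n+1)-1 when opening splitpoint n+1. */
static bool
start_new_splitpoint_sketch(Relation rel, uint32 splitpoint)
{
	BlockNumber firstbucket = ((BlockNumber) 1) << splitpoint;
	uint32		nbuckets = ((uint32) 1) << splitpoint;

	return _hash_alloc_buckets(rel, firstbucket, nbuckets);
}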
Example #11
/*
 * Read a FSM page.
 *
 * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
 * true, the FSM file is extended.
 */
static Buffer
fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
{
	BlockNumber blkno = fsm_logical_to_physical(addr);
	Buffer		buf;

	RelationOpenSmgr(rel);

	/*
	 * If we haven't cached the size of the FSM yet, check it first.  Also
	 * recheck if the requested block seems to be past end, since our cached
	 * value might be stale.  (We send smgr inval messages on truncation, but
	 * not on extension.)
	 */
	if (rel->rd_smgr->smgr_fsm_nblocks == InvalidBlockNumber ||
		blkno >= rel->rd_smgr->smgr_fsm_nblocks)
	{
		if (smgrexists(rel->rd_smgr, FSM_FORKNUM))
			rel->rd_smgr->smgr_fsm_nblocks = smgrnblocks(rel->rd_smgr,
														 FSM_FORKNUM);
		else
			rel->rd_smgr->smgr_fsm_nblocks = 0;
	}

	/* Handle requests beyond EOF */
	if (blkno >= rel->rd_smgr->smgr_fsm_nblocks)
	{
		if (extend)
			fsm_extend(rel, blkno + 1);
		else
			return InvalidBuffer;
	}

	/*
	 * Use ZERO_ON_ERROR mode, and initialize the page if necessary. The FSM
	 * information is not accurate anyway, so it's better to clear corrupt
	 * pages than error out. Since the FSM changes are not WAL-logged, the
	 * so-called torn page problem on crash can lead to pages with corrupt
	 * headers, for example.
	 */
	buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL);
	if (PageIsNew(BufferGetPage(buf)))
		PageInit(BufferGetPage(buf), BLCKSZ, 0);
	return buf;
}
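Typical usage: searches walk the FSM tree from its root page, passing extend = true so a missing page comes back zero-initialized instead of as InvalidBuffer. FSM_ROOT_ADDRESS is the constant freespace.c uses for the top of the tree.

/* Sketch: fetch the FSM root, creating or extending the fork if needed. */
static Buffer
fsm_get_root_sketch(Relation rel)
{
	return fsm_readbuf(rel, FSM_ROOT_ADDRESS, true);
}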
Example #12
/*
 * Read a visibility map page.
 *
 * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
 * true, the visibility map file is extended.
 */
static Buffer
vm_readbuf(Relation rel, BlockNumber blkno, bool extend)
{
	Buffer		buf;

	RelationOpenSmgr(rel);

	/*
	 * If we haven't cached the size of the visibility map fork yet, check it
	 * first.  Also recheck if the requested block seems to be past end, since
	 * our cached value might be stale.  (We send smgr inval messages on
	 * truncation, but not on extension.)
	 */
	if (rel->rd_smgr->smgr_vm_nblocks == InvalidBlockNumber ||
		blkno >= rel->rd_smgr->smgr_vm_nblocks)
	{
		if (smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
			rel->rd_smgr->smgr_vm_nblocks = smgrnblocks(rel->rd_smgr,
													  VISIBILITYMAP_FORKNUM);
		else
			rel->rd_smgr->smgr_vm_nblocks = 0;
	}

	/* Handle requests beyond EOF */
	if (blkno >= rel->rd_smgr->smgr_vm_nblocks)
	{
		if (extend)
			vm_extend(rel, blkno + 1);
		else
			return InvalidBuffer;
	}

	/*
	 * Use ZERO_ON_ERROR mode, and initialize the page if necessary. It's
	 * always safe to clear bits, so it's better to clear corrupt pages than
	 * error out.
	 */
	buf = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, blkno,
							 RBM_ZERO_ON_ERROR, NULL);
	if (PageIsNew(BufferGetPage(buf)))
		PageInit(BufferGetPage(buf), BLCKSZ, 0);
	return buf;
}
Example #13
/*
 * Read a visibility map page.
 *
 * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
 * true, the visibility map file is extended.
 */
static Buffer
vm_readbuf(Relation rel, BlockNumber blkno, bool extend)
{
	Buffer		buf;

	RelationOpenSmgr(rel);

	/*
	 * The current size of the visibility map fork is kept in relcache, to
	 * avoid reading beyond EOF. If we haven't cached the size of the map yet,
	 * do that first.
	 */
	if (rel->rd_vm_nblocks == InvalidBlockNumber)
	{
		if (smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
			rel->rd_vm_nblocks = smgrnblocks(rel->rd_smgr,
											 VISIBILITYMAP_FORKNUM);
		else
			rel->rd_vm_nblocks = 0;
	}

	/* Handle requests beyond EOF */
	if (blkno >= rel->rd_vm_nblocks)
	{
		if (extend)
			vm_extend(rel, blkno + 1);
		else
			return InvalidBuffer;
	}

	/*
	 * Use ZERO_ON_ERROR mode, and initialize the page if necessary. It's
	 * always safe to clear bits, so it's better to clear corrupt pages than
	 * error out.
	 */
	buf = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, blkno,
							 RBM_ZERO_ON_ERROR, NULL);
	if (PageIsNew(BufferGetPage(buf)))
		PageInit(BufferGetPage(buf), BLCKSZ, 0);
	return buf;
}
Example #14
/*
 * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages
 *
 * This does not need to initialize the new bucket pages; we'll do that as
 * each one is used by _hash_expandtable().  But we have to extend the logical
 * EOF to the end of the splitpoint; this keeps smgr's idea of the EOF in
 * sync with ours, so that we don't get complaints from smgr.
 *
 * We do this by writing a page of zeroes at the end of the splitpoint range.
 * We expect that the filesystem will ensure that the intervening pages read
 * as zeroes too.  On many filesystems this "hole" will not be allocated
 * immediately, which means that the index file may end up more fragmented
 * than if we forced it all to be allocated now; but since we don't scan
 * hash indexes sequentially anyway, that probably doesn't matter.
 *
 * XXX It's annoying that this code is executed with the metapage lock held.
 * We need to interlock against _hash_getovflpage() adding a new overflow page
 * concurrently, but it'd likely be better to use LockRelationForExtension
 * for the purpose.  OTOH, adding a splitpoint is a very infrequent operation,
 * so it may not be worth worrying about.
 *
 * Returns TRUE if successful, or FALSE if allocation failed due to
 * BlockNumber overflow.
 */
static bool
_hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
{
    BlockNumber lastblock;
    char		zerobuf[BLCKSZ];

    lastblock = firstblock + nblocks - 1;

    /*
     * Check for overflow in block number calculation; if so, we cannot extend
     * the index anymore.
     */
    if (lastblock < firstblock || lastblock == InvalidBlockNumber)
        return false;

    MemSet(zerobuf, 0, sizeof(zerobuf));

    RelationOpenSmgr(rel);
    smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf, false);

    return true;
}
Example #15
/*
 * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages
 *
 * This does not need to initialize the new bucket pages; we'll do that as
 * each one is used by _hash_expandtable().  But we have to extend the logical
 * EOF to the end of the splitpoint; otherwise the first overflow page
 * allocated beyond the splitpoint will represent a noncontiguous access,
 * which can confuse md.c (and will probably be forbidden by future changes
 * to md.c).
 *
 * We do this by writing a page of zeroes at the end of the splitpoint range.
 * We expect that the filesystem will ensure that the intervening pages read
 * as zeroes too.  On many filesystems this "hole" will not be allocated
 * immediately, which means that the index file may end up more fragmented
 * than if we forced it all to be allocated now; but since we don't scan
 * hash indexes sequentially anyway, that probably doesn't matter.
 *
 * XXX It's annoying that this code is executed with the metapage lock held.
 * We need to interlock against _hash_getovflpage() adding a new overflow page
 * concurrently, but it'd likely be better to use LockRelationForExtension
 * for the purpose.  OTOH, adding a splitpoint is a very infrequent operation,
 * so it may not be worth worrying about.
 *
 * Returns TRUE if successful, or FALSE if allocation failed due to
 * BlockNumber overflow.
 */
static bool
_hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
{
	BlockNumber	lastblock;
	BlockNumber	endblock;
	char		zerobuf[BLCKSZ];

	lastblock = firstblock + nblocks - 1;

	/*
	 * Check for overflow in block number calculation; if so, we cannot
	 * extend the index anymore.
	 */
	if (lastblock < firstblock || lastblock == InvalidBlockNumber)
		return false;

	MemSet(zerobuf, 0, sizeof(zerobuf));

	RelationOpenSmgr(rel);

	/*
	 * XXX If the extension results in creation of new segment files,
	 * we have to make sure that each non-last file is correctly filled out to
	 * RELSEG_SIZE blocks.  This ought to be done inside mdextend, but
	 * changing the smgr API seems best left for development cycle not late
	 * beta.  Temporary fix for bug #2737.
	 */
#ifndef LET_OS_MANAGE_FILESIZE
	for (endblock = firstblock | (RELSEG_SIZE - 1);
		 endblock < lastblock;
		 endblock += RELSEG_SIZE)
		smgrextend(rel->rd_smgr, endblock, zerobuf, rel->rd_istemp);
#endif

	smgrextend(rel->rd_smgr, lastblock, zerobuf, rel->rd_istemp);

	return true;
}
Example #16
static void
ReadBlocks(int filenum)
{
	FILE	   *file;
	char		record_type;
	char	   *dbname;
	Oid			record_filenode;
	ForkNumber	record_forknum;
	BlockNumber	record_blocknum;
	BlockNumber	record_range;

	int			log_level		= DEBUG3;
	Oid			relOid			= InvalidOid;
	Relation	rel				= NULL;
	bool		skip_relation	= false;
	bool		skip_fork		= false;
	bool		skip_block		= false;
	BlockNumber	nblocks			= 0;
	BlockNumber	blocks_restored	= 0;
	const char *filepath;

	/*
	 * If this condition changes, then this code, and the code in the writer
	 * will need to be changed; especially the format specifiers in log and
	 * error messages.
	 */
	StaticAssertStmt(MaxBlockNumber == 0xFFFFFFFE, "Code may need review.");

	filepath = getSavefileName(filenum);
	file = fileOpen(filepath, PG_BINARY_R);
	dbname = readDBName(file, filepath);

	/*
	 * When restoring global objects, the dbname is a zero-length string;
	 * otherwise it is non-empty. And filenum is never expected to be
	 * smaller than 1.
	 */
	Assert(filenum >= 1);
	Assert(filenum == 1 ? strlen(dbname) == 0 : strlen(dbname) > 0);

	/* To restore the global objects, use default database */
	BackgroundWorkerInitializeConnection(filenum == 1 ? guc_default_database : dbname, NULL);
	SetCurrentStatementStartTimestamp();
	StartTransactionCommand();
	SPI_connect();
	PushActiveSnapshot(GetTransactionSnapshot());
	pgstat_report_activity(STATE_RUNNING, "restoring buffers");

	/*
	 * Note that in case of a read error, we will leak the relcache entry we
	 * may currently have open. In case of EOF, we close the relation after
	 * the loop.
	 */
	while (fileRead(&record_type, 1, file, true, filepath))
	{
		/*
		 * If we want to process signals, this seems to be the best place to
		 * do it. Generally, backends refrain from processing the config file
		 * while in a transaction, but that's mostly for fear of allowing GUC
		 * changes to affect expression evaluation, causing different results
		 * for the same expression within a transaction. Since this worker is
		 * not processing any queries, it is okay to process the config file
		 * here.
		 *
		 * Even though it's okay to process SIGHUP here, doing so doesn't add
		 * any value. The only reason we might want to process the config file
		 * here would be to allow the user to interrupt the BlockReader's
		 * operation by changing this extension's GUC parameter. But the user
		 * can do that anyway, using SIGTERM or pg_terminate_backend().
		 */

		/* Stop processing the save-file if the Postmaster wants us to die. */
		if (got_sigterm)
			break;

		ereport(log_level,
				(errmsg("record type %x - %c", record_type, record_type)));

		switch (record_type)
		{
			case 'r':
			{
				/* Close the previous relation, if any. */
				if (rel)
				{
					relation_close(rel, AccessShareLock);
					rel = NULL;
				}

				record_forknum = InvalidForkNumber;
				record_blocknum = InvalidBlockNumber;
				nblocks = 0;

				fileRead(&record_filenode, sizeof(Oid), file, false, filepath);

				relOid = GetRelOid(record_filenode);

				ereport(log_level, (errmsg("processing filenode %u, relation %u",
										record_filenode, relOid)));
				/*
				 * If the relation has been rewritten/dropped since we saved it,
				 * just skip it and process the next relation.
				 */
				if (relOid == InvalidOid)
					skip_relation = true;
				else
				{
					skip_relation = false;

					/* Open the relation */
					rel = relation_open(relOid, AccessShareLock);
					RelationOpenSmgr(rel);
				}
			}
			break;
			case 'f':
			{
				record_blocknum = InvalidBlockNumber;
				nblocks = 0;

				fileRead(&record_forknum, sizeof(ForkNumber), file, false, filepath);

				if (skip_relation)
					continue;

				if (rel == NULL)
					ereport(ERROR,
							(errmsg("found a fork record without a preceeding relation record")));

				ereport(log_level, (errmsg("processing fork %d", record_forknum)));

				if (!smgrexists(rel->rd_smgr, record_forknum))
					skip_fork = true;
				else
				{
					skip_fork = false;

					nblocks = RelationGetNumberOfBlocksInFork(rel, record_forknum);
				}
			}
			break;
			case 'b':
			{
				if (record_forknum == InvalidForkNumber)
					ereport(ERROR,
							(errmsg("found a block record without a preceeding fork record")));

				fileRead(&record_blocknum, sizeof(BlockNumber), file, false, filepath);

				if (skip_relation || skip_fork)
					continue;

				/*
				 * Don't try to read past the end of the file; it may have been
				 * shrunk by a vacuum/truncate operation.
				 */
				if (record_blocknum >= nblocks)
				{
					ereport(log_level,
							(errmsg("reader %d skipping block filenode %u forknum %d blocknum %u",
									filenum, record_filenode, record_forknum, record_blocknum)));

					skip_block = true;
					continue;
				}
				else
				{
					Buffer	buf;

					skip_block = false;

					ereport(log_level,
							(errmsg("reader %d reading block filenode %u forknum %d blocknum %u",
									filenum, record_filenode, record_forknum, record_blocknum)));

					buf = ReadBufferExtended(rel, record_forknum, record_blocknum, RBM_NORMAL, NULL);
					ReleaseBuffer(buf);

					++blocks_restored;
				}
			}
			break;
			case 'N':
			{
				BlockNumber block;

				Assert(record_blocknum != InvalidBlockNumber);

				if (record_blocknum == InvalidBlockNumber)
					ereport(ERROR,
							(errmsg("found a block range record without a preceeding block record")));

				fileRead(&record_range, sizeof(int), file, false, filepath);

				if (skip_relation || skip_fork || skip_block)
					continue;

				ereport(log_level,
						(errmsg("reader %d reading range filenode %u forknum %d blocknum %u range %u",
								filenum, record_filenode, record_forknum, record_blocknum, record_range)));

				for (block = record_blocknum + 1; block <= (record_blocknum + record_range); ++block)
				{
					Buffer	buf;

					/*
					 * Don't try to read past the end of the file; it may
					 * have been shrunk by a vacuum operation.
					 */
					if (block >= nblocks)
					{
						ereport(log_level,
								(errmsg("reader %d skipping block range filenode %u forknum %d start %u end %u",
										filenum, record_filenode, record_forknum,
										block, record_blocknum + record_range)));

						break;
					}

					buf = ReadBufferExtended(rel, record_forknum, block, RBM_NORMAL, NULL);
					ReleaseBuffer(buf);

					++blocks_restored;
				}
			}
			break;
			default:
			{
				ereport(ERROR,
						(errmsg("found unexpected save-file marker %x - %c)", record_type, record_type)));
				Assert(false);
			}
			break;
		}
	}

	if (rel)
		relation_close(rel, AccessShareLock);

	ereport(LOG,
			(errmsg("Block Reader %d: restored %u blocks",
					filenum, blocks_restored)));

	SPI_finish();
	PopActiveSnapshot();
	CommitTransactionCommand();
	pgstat_report_activity(STATE_IDLE, NULL);

	fileClose(file, filepath);

	/* Remove the save-file */
	if (remove(filepath) != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				errmsg("error removing file \"%s\" : %m", filepath)));
}
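The save-file grammar implied by the switch above, summarized as a comment; this is a reconstruction from the reader, with each field exactly as wide as the corresponding fileRead call requests.

/*
 * Save-file layout consumed by ReadBlocks (reconstructed):
 *
 *   <dbname>                      read by readDBName(); empty for file 1
 *   'r' <Oid filenode>            switch to a new relation
 *   'f' <ForkNumber forknum>      switch to a fork of that relation
 *   'b' <BlockNumber blocknum>    restore one block
 *   'N' <BlockNumber range>       restore `range` more blocks following
 *                                 the previous 'b' record
 *
 * Records repeat until EOF; any other marker raises an ERROR.
 */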
Example #17
/*
 *	visibilitymap_truncate - truncate the visibility map
 *
 * The caller must hold AccessExclusiveLock on the relation, to ensure that
 * other backends receive the smgr invalidation event that this function sends
 * before they access the VM again.
 *
 * nheapblocks is the new size of the heap.
 */
void
visibilitymap_truncate(Relation rel, BlockNumber nheapblocks)
{
	BlockNumber newnblocks;

	/* last remaining block, byte, and bit */
	BlockNumber truncBlock = HEAPBLK_TO_MAPBLOCK(nheapblocks);
	uint32		truncByte = HEAPBLK_TO_MAPBYTE(nheapblocks);
	uint8		truncBit = HEAPBLK_TO_MAPBIT(nheapblocks);

#ifdef TRACE_VISIBILITYMAP
	elog(DEBUG1, "vm_truncate %s %d", RelationGetRelationName(rel), nheapblocks);
#endif

	RelationOpenSmgr(rel);

	/*
	 * If no visibility map has been created yet for this relation, there's
	 * nothing to truncate.
	 */
	if (!smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
		return;

	/*
	 * Unless the new size is exactly at a visibility map page boundary, the
	 * tail bits in the last remaining map page, representing truncated heap
	 * blocks, need to be cleared. This is not only tidy, but also necessary
	 * because we don't get a chance to clear the bits if the heap is extended
	 * again.
	 */
	if (truncByte != 0 || truncBit != 0)
	{
		Buffer		mapBuffer;
		Page		page;
		char	   *map;

		newnblocks = truncBlock + 1;

		mapBuffer = vm_readbuf(rel, truncBlock, false);
		if (!BufferIsValid(mapBuffer))
		{
			/* nothing to do, the file was already smaller */
			return;
		}

		page = BufferGetPage(mapBuffer);
		map = PageGetContents(page);

		LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE);

		/* Clear out the unwanted bytes. */
		MemSet(&map[truncByte + 1], 0, MAPSIZE - (truncByte + 1));

		/*
		 * Mask out the unwanted bits of the last remaining byte.
		 *
		 * ((1 << 0) - 1) = 00000000 ((1 << 1) - 1) = 00000001 ... ((1 << 6) -
		 * 1) = 00111111 ((1 << 7) - 1) = 01111111
		 */
		map[truncByte] &= (1 << truncBit) - 1;

		MarkBufferDirty(mapBuffer);
		UnlockReleaseBuffer(mapBuffer);
	}
	else
		newnblocks = truncBlock;

	if (smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM) <= newnblocks)
	{
		/* nothing to do, the file was already smaller than requested size */
		return;
	}

	/* Truncate the unused VM pages, and send smgr inval message */
	smgrtruncate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, newnblocks);

	/*
	 * We might as well update the local smgr_vm_nblocks setting. smgrtruncate
	 * sent an smgr cache inval message, which will cause other backends to
	 * invalidate their copy of smgr_vm_nblocks, and this one too at the next
	 * command boundary.  But this ensures it isn't outright wrong until then.
	 */
	if (rel->rd_smgr)
		rel->rd_smgr->smgr_vm_nblocks = newnblocks;
}
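The HEAPBLK_TO_* macros used above follow from the one-bit-per-heap-block layout. Their conventional definitions in visibilitymap.c of this era look like the following; treat the exact text as an assumption.

/* Conventional definitions behind the HEAPBLK_TO_* macros (sketch). */
#define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
#define HEAPBLOCKS_PER_BYTE 8
#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)

#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
#define HEAPBLK_TO_MAPBYTE(x)  (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
#define HEAPBLK_TO_MAPBIT(x)   ((x) % HEAPBLOCKS_PER_BYTE)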
Example #18
/*
 * Read tuples in correct sort order from tuplesort, and load them into
 * btree leaves.
 */
static void
_bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
{
	BTPageState *state = NULL;
	bool		merge = (btspool2 != NULL);
	IndexTuple	itup,
				itup2 = NULL;
	bool		load1;
	TupleDesc	tupdes = RelationGetDescr(wstate->index);
	int			i,
				keysz = RelationGetNumberOfAttributes(wstate->index);
	ScanKey		indexScanKey = NULL;
	SortSupport sortKeys;

	if (merge)
	{
		/*
		 * Another BTSpool for dead tuples exists. Now we have to merge
		 * btspool and btspool2.
		 */

		/* the preparation of merge */
		itup = tuplesort_getindextuple(btspool->sortstate, true);
		itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
		indexScanKey = _bt_mkscankey_nodata(wstate->index);

		/* Prepare SortSupport data for each column */
		sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));

		for (i = 0; i < keysz; i++)
		{
			SortSupport sortKey = sortKeys + i;
			ScanKey		scanKey = indexScanKey + i;
			int16		strategy;

			sortKey->ssup_cxt = CurrentMemoryContext;
			sortKey->ssup_collation = scanKey->sk_collation;
			sortKey->ssup_nulls_first =
				(scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
			sortKey->ssup_attno = scanKey->sk_attno;
			/* Abbreviation is not supported here */
			sortKey->abbreviate = false;

			AssertState(sortKey->ssup_attno != 0);

			strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
				BTGreaterStrategyNumber : BTLessStrategyNumber;

			PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey);
		}

		_bt_freeskey(indexScanKey);

		for (;;)
		{
			load1 = true;		/* load BTSpool next ? */
			if (itup2 == NULL)
			{
				if (itup == NULL)
					break;
			}
			else if (itup != NULL)
			{
				for (i = 1; i <= keysz; i++)
				{
					SortSupport entry;
					Datum		attrDatum1,
								attrDatum2;
					bool		isNull1,
								isNull2;
					int32		compare;

					entry = sortKeys + i - 1;
					attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
					attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);

					compare = ApplySortComparator(attrDatum1, isNull1,
												  attrDatum2, isNull2,
												  entry);
					if (compare > 0)
					{
						load1 = false;
						break;
					}
					else if (compare < 0)
						break;
				}
			}
			else
				load1 = false;

			/* When we see first tuple, create first index page */
			if (state == NULL)
				state = _bt_pagestate(wstate, 0);

			if (load1)
			{
				_bt_buildadd(wstate, state, itup);
				itup = tuplesort_getindextuple(btspool->sortstate, true);
			}
			else
			{
				_bt_buildadd(wstate, state, itup2);
				itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
			}
		}
		pfree(sortKeys);
	}
	else
	{
		/* merge is unnecessary */
		while ((itup = tuplesort_getindextuple(btspool->sortstate,
											   true)) != NULL)
		{
			/* When we see first tuple, create first index page */
			if (state == NULL)
				state = _bt_pagestate(wstate, 0);

			_bt_buildadd(wstate, state, itup);
		}
	}

	/* Close down final pages and write the metapage */
	_bt_uppershutdown(wstate, state);

	/*
	 * If the index is WAL-logged, we must fsync it down to disk before it's
	 * safe to commit the transaction.  (For a non-WAL-logged index we don't
	 * care since the index will be uninteresting after a crash anyway.)
	 *
	 * It's obvious that we must do this when not WAL-logging the build. It's
	 * less obvious that we have to do it even if we did WAL-log the index
	 * pages.  The reason is that since we're building outside shared buffers,
	 * a CHECKPOINT occurring during the build has no way to flush the
	 * previously written data to disk (indeed it won't know the index even
	 * exists).  A crash later on would replay WAL from the checkpoint,
	 * therefore it wouldn't replay our earlier WAL entries. If we do not
	 * fsync those pages here, they might still not be on disk when the crash
	 * occurs.
	 */
	if (RelationNeedsWAL(wstate->index))
	{
		RelationOpenSmgr(wstate->index);
		smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
	}
}
Example #19
/*
 * Insert a tuple to the new relation.	This has to track heap_insert
 * and its subsidiary functions!
 *
 * t_self of the tuple is set to the new TID of the tuple. If t_ctid of the
 * tuple is invalid on entry, it's replaced with the new TID as well (in
 * the inserted data only, not in the caller's copy).
 */
static void
raw_heap_insert(RewriteState state, HeapTuple tup)
{
	Page		page = state->rs_buffer;
	Size		pageFreeSpace,
				saveFreeSpace;
	Size		len;
	OffsetNumber newoff;
	HeapTuple	heaptup;

	/*
	 * If the new tuple is too big for storage or contains already toasted
	 * out-of-line attributes from some other relation, invoke the toaster.
	 *
	 * Note: below this point, heaptup is the data we actually intend to store
	 * into the relation; tup is the caller's original untoasted data.
	 */
	if (state->rs_new_rel->rd_rel->relkind == RELKIND_TOASTVALUE)
	{
		/* toast table entries should never be recursively toasted */
		Assert(!HeapTupleHasExternal(tup));
		heaptup = tup;
	}
	else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
		heaptup = toast_insert_or_update(state->rs_new_rel, tup, NULL,
										 HEAP_INSERT_SKIP_FSM |
										 (state->rs_use_wal ?
										  0 : HEAP_INSERT_SKIP_WAL));
	else
		heaptup = tup;

	len = MAXALIGN(heaptup->t_len);		/* be conservative */

	/*
	 * If we're gonna fail for oversize tuple, do it right away
	 */
	if (len > MaxHeapTupleSize)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("row is too big: size %zu, maximum size %zu",
						len, MaxHeapTupleSize)));

	/* Compute desired extra freespace due to fillfactor option */
	saveFreeSpace = RelationGetTargetPageFreeSpace(state->rs_new_rel,
												   HEAP_DEFAULT_FILLFACTOR);

	/* Now we can check to see if there's enough free space already. */
	if (state->rs_buffer_valid)
	{
		pageFreeSpace = PageGetHeapFreeSpace(page);

		if (len + saveFreeSpace > pageFreeSpace)
		{
			/* Doesn't fit, so write out the existing page */

			/* XLOG stuff */
			if (state->rs_use_wal)
				log_newpage(&state->rs_new_rel->rd_node,
							MAIN_FORKNUM,
							state->rs_blockno,
							page,
							true);

			/*
			 * Now write the page. We say isTemp = true even if it's not a
			 * temp table, because there's no need for smgr to schedule an
			 * fsync for this write; we'll do it ourselves in
			 * end_heap_rewrite.
			 */
			RelationOpenSmgr(state->rs_new_rel);

			PageSetChecksumInplace(page, state->rs_blockno);

			smgrextend(state->rs_new_rel->rd_smgr, MAIN_FORKNUM,
					   state->rs_blockno, (char *) page, true);

			state->rs_blockno++;
			state->rs_buffer_valid = false;
		}
	}

	if (!state->rs_buffer_valid)
	{
		/* Initialize a new empty page */
		PageInit(page, BLCKSZ, 0);
		state->rs_buffer_valid = true;
	}

	/* And now we can insert the tuple into the page */
	newoff = PageAddItem(page, (Item) heaptup->t_data, heaptup->t_len,
						 InvalidOffsetNumber, false, true);
	if (newoff == InvalidOffsetNumber)
		elog(ERROR, "failed to add tuple");

	/* Update caller's t_self to the actual position where it was stored */
	ItemPointerSet(&(tup->t_self), state->rs_blockno, newoff);

	/*
	 * Insert the correct position into CTID of the stored tuple, too, if the
	 * caller didn't supply a valid CTID.
	 */
	if (!ItemPointerIsValid(&tup->t_data->t_ctid))
	{
		ItemId		newitemid;
		HeapTupleHeader onpage_tup;

		newitemid = PageGetItemId(page, newoff);
		onpage_tup = (HeapTupleHeader) PageGetItem(page, newitemid);

		onpage_tup->t_ctid = tup->t_self;
	}

	/* If heaptup is a private copy, release it. */
	if (heaptup != tup)
		heap_freetuple(heaptup);
}
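The free-space check above is easiest to see with concrete numbers; all values in this worked example are assumed, and RelationGetTargetPageFreeSpace reduces to BLCKSZ * (100 - fillfactor) / 100.

/*
 * Worked example of the page-switch decision (numbers assumed):
 *
 *   BLCKSZ        = 8192, heap fillfactor = 90
 *   saveFreeSpace = 8192 * (100 - 90) / 100 = 819 bytes
 *
 * With pageFreeSpace = 1000 bytes left on the current page:
 *   len = 150  ->  150 + 819 =  969 <= 1000  -> tuple fits on this page
 *   len = 200  ->  200 + 819 = 1019 >  1000  -> page is written out first
 */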
Example #20
File: nbtsort.c Project: huor/gpdb
/*
 * emit a completed btree page, and release the working storage.
 */
static void
_bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
{
	// Fetch gp_persistent_relation_node information that will be added to XLOG record.
	RelationFetchGpRelationNodeForXLog(wstate->index);

	/* Ensure rd_smgr is open (could have been closed by relcache flush!) */
	RelationOpenSmgr(wstate->index);

	/* XLOG stuff */
	if (wstate->btws_use_wal)
	{
		_bt_lognewpage(wstate->index, page, blkno);
	}

	else
	{
		/* Leave the page LSN zero if not WAL-logged, but set TLI anyway */
		PageSetTLI(page, ThisTimeLineID);
	}

	/*
	 * If we have to write pages nonsequentially, fill in the space with
	 * zeroes until we come back and overwrite.  This is not logically
	 * necessary on standard Unix filesystems (unwritten space will read as
	 * zeroes anyway), but it should help to avoid fragmentation. The dummy
	 * pages aren't WAL-logged though.
	 */
	while (blkno > wstate->btws_pages_written)
	{
		if (!wstate->btws_zeropage)
			wstate->btws_zeropage = (Page) palloc0(BLCKSZ);

		// -------- MirroredLock ----------
		// UNDONE: Unfortunately, I think we write temp relations to the mirror...
		LWLockAcquire(MirroredLock, LW_SHARED);

		smgrextend(wstate->index->rd_smgr, wstate->btws_pages_written++,
				   (char *) wstate->btws_zeropage,
				   true);

		LWLockRelease(MirroredLock);
		// -------- MirroredLock ----------
	}

	
	// -------- MirroredLock ----------
	// UNDONE: Unfortunately, I think we write temp relations to the mirror...
	LWLockAcquire(MirroredLock, LW_SHARED);

	/*
	 * Now write the page.	We say isTemp = true even if it's not a temp
	 * index, because there's no need for smgr to schedule an fsync for this
	 * write; we'll do it ourselves before ending the build.
	 */
	if (blkno == wstate->btws_pages_written)
	{
		/* extending the file... */
		smgrextend(wstate->index->rd_smgr, blkno, (char *) page, true);
		wstate->btws_pages_written++;
	}
	else
	{
		/* overwriting a block we zero-filled before */
		smgrwrite(wstate->index->rd_smgr, blkno, (char *) page, true);
	}

	LWLockRelease(MirroredLock);
	// -------- MirroredLock ----------

	pfree(page);
}
Example #21
/*
 * Open a relation during XLOG replay
 *
 * Note: this once had an API that allowed NULL return on failure, but it
 * no longer does; any failure results in elog().
 */
Relation
XLogOpenRelation(RelFileNode rnode)
{
	XLogRelDesc *res;
	XLogRelCacheEntry *hentry;
	bool		found;

	hentry = (XLogRelCacheEntry *)
		hash_search(_xlrelcache, (void *) &rnode, HASH_FIND, NULL);

	if (hentry)
	{
		res = hentry->rdesc;

		res->lessRecently->moreRecently = res->moreRecently;
		res->moreRecently->lessRecently = res->lessRecently;
	}
	else
	{
		res = _xl_new_reldesc();

		sprintf(RelationGetRelationName(&(res->reldata)), "%u", rnode.relNode);

		res->reldata.rd_node = rnode;

		/*
		 * We set up the lockRelId in case anything tries to lock the dummy
		 * relation.  Note that this is fairly bogus since relNode may be
		 * different from the relation's OID.  It shouldn't really matter
		 * though, since we are presumably running by ourselves and can't have
		 * any lock conflicts ...
		 */
		res->reldata.rd_lockInfo.lockRelId.dbId = rnode.dbNode;
		res->reldata.rd_lockInfo.lockRelId.relId = rnode.relNode;

		hentry = (XLogRelCacheEntry *)
			hash_search(_xlrelcache, (void *) &rnode, HASH_ENTER, &found);

		if (found)
			elog(PANIC, "xlog relation already present on insert into cache");

		hentry->rdesc = res;

		res->reldata.rd_targblock = InvalidBlockNumber;
		res->reldata.rd_smgr = NULL;
		RelationOpenSmgr(&(res->reldata));

		/*
		 * Create the target file if it doesn't already exist.  This lets us
		 * cope if the replay sequence contains writes to a relation that is
		 * later deleted.  (The original coding of this routine would instead
		 * return NULL, causing the writes to be suppressed. But that seems
		 * like it risks losing valuable data if the filesystem loses an inode
		 * during a crash.	Better to write the data until we are actually
		 * told to delete the file.)
		 */
		smgrcreate(res->reldata.rd_smgr, res->reldata.rd_istemp, true);
	}

	res->moreRecently = &(_xlrelarr[0]);
	res->lessRecently = _xlrelarr[0].lessRecently;
	_xlrelarr[0].lessRecently = res;
	res->lessRecently->moreRecently = res;

	return &(res->reldata);
}
Example #22
/*
 * FreeSpaceMapTruncateRel - adjust for truncation of a relation.
 *
 * The caller must hold AccessExclusiveLock on the relation, to ensure that
 * other backends receive the smgr invalidation event that this function sends
 * before they access the FSM again.
 *
 * nblocks is the new size of the heap.
 */
void
FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks)
{
	BlockNumber new_nfsmblocks;
	FSMAddress	first_removed_address;
	uint16		first_removed_slot;
	Buffer		buf;

	RelationOpenSmgr(rel);

	/*
	 * If no FSM has been created yet for this relation, there's nothing to
	 * truncate.
	 */
	if (!smgrexists(rel->rd_smgr, FSM_FORKNUM))
		return;

	/* Get the location in the FSM of the first removed heap block */
	first_removed_address = fsm_get_location(nblocks, &first_removed_slot);

	/*
	 * Zero out the tail of the last remaining FSM page. If the slot
	 * representing the first removed heap block is at a page boundary, as the
	 * first slot on the FSM page that first_removed_address points to, we can
	 * just truncate that page altogether.
	 */
	if (first_removed_slot > 0)
	{
		buf = fsm_readbuf(rel, first_removed_address, false);
		if (!BufferIsValid(buf))
			return;				/* nothing to do; the FSM was already smaller */
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

		/* NO EREPORT(ERROR) from here till changes are logged */
		START_CRIT_SECTION();

		fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);

		/*
		 * Truncation of a relation is WAL-logged at a higher-level, and we
		 * will be called at WAL replay. But if checksums are enabled, we need
		 * to still write a WAL record to protect against a torn page, if the
		 * page is flushed to disk before the truncation WAL record. We cannot
		 * use MarkBufferDirtyHint here, because that will not dirty the page
		 * during recovery.
		 */
		MarkBufferDirty(buf);
		if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
			log_newpage_buffer(buf, false);

		END_CRIT_SECTION();

		UnlockReleaseBuffer(buf);

		new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
	}
	else
	{
		new_nfsmblocks = fsm_logical_to_physical(first_removed_address);
		if (smgrnblocks(rel->rd_smgr, FSM_FORKNUM) <= new_nfsmblocks)
			return;				/* nothing to do; the FSM was already smaller */
	}

	/* Truncate the unused FSM pages, and send smgr inval message */
	smgrtruncate(rel->rd_smgr, FSM_FORKNUM, new_nfsmblocks);

	/*
	 * We might as well update the local smgr_fsm_nblocks setting.
	 * smgrtruncate sent an smgr cache inval message, which will cause other
	 * backends to invalidate their copy of smgr_fsm_nblocks, and this one too
	 * at the next command boundary.  But this ensures it isn't outright wrong
	 * until then.
	 */
	if (rel->rd_smgr)
		rel->rd_smgr->smgr_fsm_nblocks = new_nfsmblocks;

	/*
	 * Update upper-level FSM pages to account for the truncation.  This is
	 * important because the just-truncated pages were likely marked as
	 * all-free, and would be preferentially selected.
	 */
	FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber);
}
Example #23
/*
 * Read a FSM page.
 *
 * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
 * true, the FSM file is extended.
 */
static Buffer
fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
{
	BlockNumber blkno = fsm_logical_to_physical(addr);
	Buffer		buf;

	RelationOpenSmgr(rel);

	/*
	 * If we haven't cached the size of the FSM yet, check it first.  Also
	 * recheck if the requested block seems to be past end, since our cached
	 * value might be stale.  (We send smgr inval messages on truncation, but
	 * not on extension.)
	 */
	if (rel->rd_smgr->smgr_fsm_nblocks == InvalidBlockNumber ||
		blkno >= rel->rd_smgr->smgr_fsm_nblocks)
	{
		if (smgrexists(rel->rd_smgr, FSM_FORKNUM))
			rel->rd_smgr->smgr_fsm_nblocks = smgrnblocks(rel->rd_smgr,
														 FSM_FORKNUM);
		else
			rel->rd_smgr->smgr_fsm_nblocks = 0;
	}

	/* Handle requests beyond EOF */
	if (blkno >= rel->rd_smgr->smgr_fsm_nblocks)
	{
		if (extend)
			fsm_extend(rel, blkno + 1);
		else
			return InvalidBuffer;
	}

	/*
	 * Use ZERO_ON_ERROR mode, and initialize the page if necessary. The FSM
	 * information is not accurate anyway, so it's better to clear corrupt
	 * pages than error out. Since the FSM changes are not WAL-logged, the
	 * so-called torn page problem on crash can lead to pages with corrupt
	 * headers, for example.
	 *
	 * The initialize-the-page part is trickier than it looks, because of the
	 * possibility of multiple backends doing this concurrently, and our
	 * desire to not uselessly take the buffer lock in the normal path where
	 * the page is OK.  We must take the lock to initialize the page, so
	 * recheck page newness after we have the lock, in case someone else
	 * already did it.  Also, because we initially check PageIsNew with no
	 * lock, it's possible to fall through and return the buffer while someone
	 * else is still initializing the page (i.e., we might see pd_upper as set
	 * but other page header fields are still zeroes).  This is harmless for
	 * callers that will take a buffer lock themselves, but some callers
	 * inspect the page without any lock at all.  The latter is OK only so
	 * long as it doesn't depend on the page header having correct contents.
	 * Current usage is safe because PageGetContents() does not require that.
	 */
	buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL);
	if (PageIsNew(BufferGetPage(buf)))
	{
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		if (PageIsNew(BufferGetPage(buf)))
			PageInit(BufferGetPage(buf), BLCKSZ, 0);
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	}
	return buf;
}
Example #24
/*
 * _bt_mergeload - Merge two streams of index tuples into new index files.
 */
static void
_bt_mergeload(Spooler *self, BTWriteState *wstate, BTSpool *btspool, BTReader *btspool2, Relation heapRel)
{
	BTPageState	   *state = NULL;
	IndexTuple		itup,
					itup2;
	bool			should_free = false;
	TupleDesc		tupdes = RelationGetDescr(wstate->index);
	int				keysz = RelationGetNumberOfAttributes(wstate->index);
	ScanKey			indexScanKey;
	ON_DUPLICATE	on_duplicate = self->on_duplicate;

	Assert(btspool != NULL);

	/* the preparation of merge */
	itup = BTSpoolGetNextItem(btspool, NULL, &should_free);
	itup2 = BTReaderGetNextItem(btspool2);
	indexScanKey = _bt_mkscankey_nodata(wstate->index);

	for (;;)
	{
		bool	load1 = true;		/* load BTSpool next ? */
		bool	hasnull;
		int32	compare;

		if (self->dup_old + self->dup_new > self->max_dup_errors)
			ereport(ERROR,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("Maximum duplicate error count exceeded")));

		if (itup2 == NULL)
		{
			if (itup == NULL)
				break;
		}
		else if (itup != NULL)
		{
			compare = compare_indextuple(itup, itup2, indexScanKey,
										 keysz, tupdes, &hasnull);

			if (compare == 0 && !hasnull && btspool->isunique)
			{
				ItemPointerData t_tid2;

				/*
				 * heap_is_visible() updates t_tid; since we still need the
				 * original value for the index, back it up first.
				 */
				ItemPointerCopy(&itup2->t_tid, &t_tid2);

				/* The tuple pointed by the old index should not be visible. */
				if (!heap_is_visible(heapRel, &itup->t_tid))
				{
					itup = BTSpoolGetNextItem(btspool, itup, &should_free);
				}
				else if (!heap_is_visible(heapRel, &itup2->t_tid))
				{
					itup2 = BTReaderGetNextItem(btspool2);
				}
				else
				{
					if (on_duplicate == ON_DUPLICATE_KEEP_NEW)
					{
						self->dup_old++;
						remove_duplicate(self, heapRel, itup2,
							RelationGetRelationName(wstate->index));
						itup2 = BTReaderGetNextItem(btspool2);
					}
					else
					{
						ItemPointerCopy(&t_tid2, &itup2->t_tid);
						self->dup_new++;
						remove_duplicate(self, heapRel, itup,
							RelationGetRelationName(wstate->index));
						itup = BTSpoolGetNextItem(btspool, itup, &should_free);
					}
				}

				continue;
			}
			else if (compare > 0)
				load1 = false;
		}
		else
			load1 = false;

		BULKLOAD_PROFILE(&prof_merge_unique);

		/* When we see the first tuple, create the first index page */
		if (state == NULL)
			state = _bt_pagestate(wstate, 0);

		if (load1)
		{
			IndexTuple	next_itup = NULL;
			bool		next_should_free = false;

			for (;;)
			{
				/* get next item */
				next_itup = BTSpoolGetNextItem(btspool, next_itup,
											   &next_should_free);

				if (!btspool->isunique || next_itup == NULL)
					break;

				compare = compare_indextuple(itup, next_itup, indexScanKey,
											 keysz, tupdes, &hasnull);
				if (compare < 0 || hasnull)
					break;

				if (compare > 0)
				{
					/* shouldn't happen */
					elog(ERROR, "faild in tuplesort_performsort");
				}

				/*
				 * If the tuple was deleted because of a conflict on another
				 * unique index, it is no longer visible; skip it.
				 */
				if (!heap_is_visible(heapRel, &next_itup->t_tid))
				{
					continue;
				}

				if (!heap_is_visible(heapRel, &itup->t_tid))
				{
					if (should_free)
						pfree(itup);

					itup = next_itup;
					should_free = next_should_free;
					next_should_free = false;
					continue;
				}

				/* not unique between input files */
				self->dup_new++;
				remove_duplicate(self, heapRel, next_itup,
								 RelationGetRelationName(wstate->index));

				if (self->dup_old + self->dup_new > self->max_dup_errors)
					ereport(ERROR,
							(errcode(ERRCODE_INTERNAL_ERROR),
							 errmsg("maximum duplicate error count exceeded")));
			}

			_bt_buildadd(wstate, state, itup);

			if (should_free)
				pfree(itup);

			itup = next_itup;
			should_free = next_should_free;
		}
		else
		{
			_bt_buildadd(wstate, state, itup2);
			itup2 = BTReaderGetNextItem(btspool2);
		}
		BULKLOAD_PROFILE(&prof_merge_insert);
	}
	_bt_freeskey(indexScanKey);

	/* Close down final pages and write the metapage */
	_bt_uppershutdown(wstate, state);

	/*
	 * If the index isn't temp, we must fsync it down to disk before it's safe
	 * to commit the transaction.  (For a temp index we don't care since the
	 * index will be uninteresting after a crash anyway.)
	 *
	 * It's obvious that we must do this when not WAL-logging the build. It's
	 * less obvious that we have to do it even if we did WAL-log the index
	 * pages.  The reason is that since we're building outside shared buffers,
	 * a CHECKPOINT occurring during the build has no way to flush the
	 * previously written data to disk (indeed it won't know the index even
	 * exists).  A crash later on would replay WAL from the checkpoint,
	 * therefore it wouldn't replay our earlier WAL entries. If we do not
	 * fsync those pages here, they might still not be on disk when the crash
	 * occurs.
	 */
	if (!RELATION_IS_LOCAL(wstate->index))
	{
		RelationOpenSmgr(wstate->index);
		smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
	}
	BULKLOAD_PROFILE(&prof_merge_term);
}
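Stripped of the visibility and duplicate handling, the control flow of _bt_mergeload is a plain two-way merge of sorted streams. A self-contained sketch over integer arrays (names are hypothetical, for illustration only):

#include <stdbool.h>
#include <stddef.h>

/*
 * Emit the union of two ascending arrays in sorted order, mirroring the
 * load1 selection above: take from stream 1 unless it is exhausted or
 * stream 2's head is strictly smaller (ties favor stream 1).
 */
static size_t
merge_two(const int *a, size_t na, const int *b, size_t nb, int *out)
{
	size_t	i = 0, j = 0, k = 0;

	while (i < na || j < nb)
	{
		bool	load1 = true;	/* load from stream 1 next? */

		if (i >= na)
			load1 = false;
		else if (j < nb && b[j] < a[i])
			load1 = false;

		out[k++] = load1 ? a[i++] : b[j++];
	}
	return k;
}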
Example #25
0
/*
 * Open a relation during XLOG replay
 *
 * Note: this once had an API that allowed NULL return on failure, but it
 * no longer does; any failure results in elog().
 */
Relation
XLogOpenRelation(RelFileNode rnode)
{
	XLogRelDesc *res;
	XLogRelCacheEntry *hentry;
	bool		found;

	hentry = (XLogRelCacheEntry *)
		hash_search(_xlrelcache, (void *) &rnode, HASH_FIND, NULL);

	if (hentry)
	{
		res = hentry->rdesc;

		res->lessRecently->moreRecently = res->moreRecently;
		res->moreRecently->lessRecently = res->lessRecently;
	}
	else
	{
		/*
		 * We need to fault in the database directory on the standby.
		 */
		if (rnode.spcNode != GLOBALTABLESPACE_OID && IsStandbyMode())
		{
			char *primaryFilespaceLocation = NULL;

			char *dbPath;
			
			if (IsBuiltinTablespace(rnode.spcNode))
			{
				/*
				 * No filespace to fetch.
				 */
			}
			else
			{		
				char *mirrorFilespaceLocation = NULL;
			
				/*
				 * Investigate whether the containing directories exist to give more detail.
				 */
				PersistentTablespace_GetPrimaryAndMirrorFilespaces(
													rnode.spcNode,
													&primaryFilespaceLocation,
													&mirrorFilespaceLocation);
				if (primaryFilespaceLocation == NULL ||
					strlen(primaryFilespaceLocation) == 0)
				{
					elog(ERROR, "Empty primary filespace directory location");
				}
			
				if (mirrorFilespaceLocation != NULL)
				{
					pfree(mirrorFilespaceLocation);
					mirrorFilespaceLocation = NULL;
				}
			}
			
			dbPath = (char*)palloc(MAXPGPATH + 1);
			
			FormDatabasePath(
						dbPath,
						primaryFilespaceLocation,
						rnode.spcNode,
						rnode.dbNode);

			if (primaryFilespaceLocation != NULL)
			{
				pfree(primaryFilespaceLocation);
				primaryFilespaceLocation = NULL;
			}
			
			if (mkdir(dbPath, 0700) == 0)
			{
				if (Debug_persistent_recovery_print)
				{
					elog(PersistentRecovery_DebugPrintLevel(), 
						 "XLogOpenRelation: Re-created database directory \"%s\"",
						 dbPath);
				}
			}
			else
			{
				/*
				 * Allowed to already exist.
				 */
				if (errno != EEXIST)
				{
					elog(ERROR, "could not create database directory \"%s\": %m",
						 dbPath);
				}
				else
				{
					if (Debug_persistent_recovery_print)
					{
						elog(PersistentRecovery_DebugPrintLevel(), 
							 "XLogOpenRelation: Database directory \"%s\" already exists",
							 dbPath);
					}
				}
			}

			pfree(dbPath);
		}
		
		res = _xl_new_reldesc();

		sprintf(RelationGetRelationName(&(res->reldata)), "%u", rnode.relNode);

		res->reldata.rd_node = rnode;

		/*
		 * We set up the lockRelId in case anything tries to lock the dummy
		 * relation.  Note that this is fairly bogus since relNode may be
		 * different from the relation's OID.  It shouldn't really matter
		 * though, since we are presumably running by ourselves and can't have
		 * any lock conflicts ...
		 */
		res->reldata.rd_lockInfo.lockRelId.dbId = rnode.dbNode;
		res->reldata.rd_lockInfo.lockRelId.relId = rnode.relNode;

		hentry = (XLogRelCacheEntry *)
			hash_search(_xlrelcache, (void *) &rnode, HASH_ENTER, &found);

		if (found)
			elog(PANIC, "xlog relation already present on insert into cache");

		hentry->rdesc = res;

		res->reldata.rd_targblock = InvalidBlockNumber;
		res->reldata.rd_smgr = NULL;
		RelationOpenSmgr(&(res->reldata));

		/*
		 * Create the target file if it doesn't already exist.  This lets us
		 * cope if the replay sequence contains writes to a relation that is
		 * later deleted.  (The original coding of this routine would instead
		 * return NULL, causing the writes to be suppressed. But that seems
		 * like it risks losing valuable data if the filesystem loses an inode
		 * during a crash.	Better to write the data until we are actually
		 * told to delete the file.)
		 */
		// NOTE: We no longer re-create files automatically because
		// new FileRep persistent objects will ensure files exist.

		// UNDONE: Can't remove this block of code yet until boot time calls to this routine are analyzed...
		{
			MirrorDataLossTrackingState mirrorDataLossTrackingState;
			int64 mirrorDataLossTrackingSessionNum;
			bool mirrorDataLossOccurred;
			
			// UNDONE: What about the persistent rel files table???
			// UNDONE: This condition should not occur anymore.
			// UNDONE: segmentFileNum and AO?
			mirrorDataLossTrackingState = 
						FileRepPrimary_GetMirrorDataLossTrackingSessionNum(
														&mirrorDataLossTrackingSessionNum);
			smgrcreate(
				res->reldata.rd_smgr, 
				res->reldata.rd_isLocalBuf, 
				/* relationName */ NULL,		// Ok to be NULL -- we don't know the name here.
				mirrorDataLossTrackingState,
				mirrorDataLossTrackingSessionNum,
				/* ignoreAlreadyExists */ true,
				&mirrorDataLossOccurred);
			
		}
	}

	res->moreRecently = &(_xlrelarr[0]);
	res->lessRecently = _xlrelarr[0].lessRecently;
	_xlrelarr[0].lessRecently = res;
	res->lessRecently->moreRecently = res;

	Assert(&(res->reldata) != NULL);	// Assert what it says in the interface -- we don't return NULL anymore.

	return &(res->reldata);
}
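The moreRecently/lessRecently pointer updates above maintain a circular MRU list with _xlrelarr[0] as the sentinel. The two operations are easier to read as named helpers; this is a hypothetical refactoring sketch, not the original code:

typedef struct MruNode
{
	struct MruNode *moreRecently;
	struct MruNode *lessRecently;
} MruNode;

/* Detach a node from the circular list (the cache-hit path above). */
static void
mru_unlink(MruNode *n)
{
	n->lessRecently->moreRecently = n->moreRecently;
	n->moreRecently->lessRecently = n->lessRecently;
}

/*
 * Splice a node in next to the sentinel head, making it the most
 * recently used entry (the four assignments at the bottom of
 * XLogOpenRelation).  The sentinel must point at itself when the
 * list is empty.
 */
static void
mru_push(MruNode *head, MruNode *n)
{
	n->moreRecently = head;
	n->lessRecently = head->lessRecently;
	head->lessRecently = n;
	n->lessRecently->moreRecently = n;
}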
Example #26
0
/*
 * RelationTruncate
 *		Physically truncate a relation to the specified number of blocks.
 *
 * This includes getting rid of any buffers for the blocks that are to be
 * dropped.
 */
void
RelationTruncate(Relation rel, BlockNumber nblocks)
{
	bool		fsm;
	bool		vm;

	/* Open it at the smgr level if not already done */
	RelationOpenSmgr(rel);

	/*
	 * Make sure smgr_targblock etc aren't pointing somewhere past new end
	 */
	rel->rd_smgr->smgr_targblock = InvalidBlockNumber;
	rel->rd_smgr->smgr_fsm_nblocks = InvalidBlockNumber;
	rel->rd_smgr->smgr_vm_nblocks = InvalidBlockNumber;

	/* Truncate the FSM first if it exists */
	fsm = smgrexists(rel->rd_smgr, FSM_FORKNUM);
	if (fsm)
		FreeSpaceMapTruncateRel(rel, nblocks);

	/* Truncate the visibility map too if it exists. */
	vm = smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
	if (vm)
		visibilitymap_truncate(rel, nblocks);

	/*
	 * We WAL-log the truncation before actually truncating, which means
	 * trouble if the truncation fails. If we then crash, the WAL replay
	 * likely isn't going to succeed in the truncation either, and cause a
	 * PANIC. It's tempting to put a critical section here, but that cure
	 * would be worse than the disease. It would turn a usually harmless
	 * failure to truncate, that might spell trouble at WAL replay, into a
	 * certain PANIC.
	 */
	if (!rel->rd_istemp)
	{
		/*
		 * Make an XLOG entry reporting the file truncation.
		 */
		XLogRecPtr	lsn;
		XLogRecData rdata;
		xl_smgr_truncate xlrec;

		xlrec.blkno = nblocks;
		xlrec.rnode = rel->rd_node;

		rdata.data = (char *) &xlrec;
		rdata.len = sizeof(xlrec);
		rdata.buffer = InvalidBuffer;
		rdata.next = NULL;

		lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE, &rdata);

		/*
		 * Flush, because otherwise the truncation of the main relation might
		 * hit the disk before the WAL record, and the truncation of the FSM
		 * or visibility map. If we crashed during that window, we'd be left
		 * with a truncated heap, but the FSM or visibility map would still
		 * contain entries for the non-existent heap pages.
		 */
		if (fsm || vm)
			XLogFlush(lsn);
	}

	/* Do the real work */
	smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks, rel->rd_istemp);
}
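The ordering RelationTruncate enforces (log the truncation, flush WAL if dependent forks already changed, then shrink the main fork) is an instance of write-ahead logging's log-before-data rule. A schematic sketch, with hypothetical callbacks standing in for XLogInsert, XLogFlush, and smgrtruncate:

#include <stdbool.h>
#include <stdint.h>

typedef uint64_t Lsn;

typedef struct WalOps
{
	Lsn		(*log_truncate)(uint32_t nblocks);	/* stands in for XLogInsert */
	void	(*flush)(Lsn upto);					/* stands in for XLogFlush */
	void	(*truncate)(uint32_t nblocks);		/* stands in for smgrtruncate */
} WalOps;

static void
truncate_with_wal(const WalOps *ops, uint32_t nblocks, bool deps_already_changed)
{
	/* 1. Record the intent in the log before touching the data. */
	Lsn		lsn = ops->log_truncate(nblocks);

	/*
	 * 2. If dependent structures (FSM, visibility map) were already
	 * truncated, force the record to disk so it cannot be lost while
	 * those changes survive a crash.
	 */
	if (deps_already_changed)
		ops->flush(lsn);

	/* 3. Only now shrink the main fork. */
	ops->truncate(nblocks);
}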
Example #27
0
/*
 * Read tuples in correct sort order from tuplesort, and load them into
 * btree leaves.
 */
static void
_bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
{
	BTPageState *state = NULL;
	bool		merge = (btspool2 != NULL);
	IndexTuple	itup,
				itup2 = NULL;
	bool		should_free,
				should_free2,
				load1;
	TupleDesc	tupdes = RelationGetDescr(wstate->index);
	int			i,
				keysz = RelationGetNumberOfAttributes(wstate->index);
	ScanKey		indexScanKey = NULL;

	if (merge)
	{
		/*
		 * Another BTSpool for dead tuples exists. Now we have to merge
		 * btspool and btspool2.
		 */

		/* prepare for the merge */
		itup = tuplesort_getindextuple(btspool->sortstate,
									   true, &should_free);
		itup2 = tuplesort_getindextuple(btspool2->sortstate,
										true, &should_free2);
		indexScanKey = _bt_mkscankey_nodata(wstate->index);

		for (;;)
		{
			load1 = true;		/* load BTSpool next? */
			if (itup2 == NULL)
			{
				if (itup == NULL)
					break;
			}
			else if (itup != NULL)
			{
				for (i = 1; i <= keysz; i++)
				{
					ScanKey		entry;
					Datum		attrDatum1,
								attrDatum2;
					bool		isNull1,
								isNull2;
					int32		compare;

					entry = indexScanKey + i - 1;
					attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
					attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
					if (isNull1)
					{
						if (isNull2)
							compare = 0;		/* NULL "=" NULL */
						else if (entry->sk_flags & SK_BT_NULLS_FIRST)
							compare = -1;		/* NULL "<" NOT_NULL */
						else
							compare = 1;		/* NULL ">" NOT_NULL */
					}
					else if (isNull2)
					{
						if (entry->sk_flags & SK_BT_NULLS_FIRST)
							compare = 1;		/* NOT_NULL ">" NULL */
						else
							compare = -1;		/* NOT_NULL "<" NULL */
					}
					else
					{
						compare =
							DatumGetInt32(FunctionCall2Coll(&entry->sk_func,
														 entry->sk_collation,
															attrDatum1,
															attrDatum2));

						if (entry->sk_flags & SK_BT_DESC)
							compare = -compare;
					}
					if (compare > 0)
					{
						load1 = false;
						break;
					}
					else if (compare < 0)
						break;
				}
			}
			else
				load1 = false;

			/* When we see the first tuple, create the first index page */
			if (state == NULL)
				state = _bt_pagestate(wstate, 0);

			if (load1)
			{
				_bt_buildadd(wstate, state, itup);
				if (should_free)
					pfree(itup);
				itup = tuplesort_getindextuple(btspool->sortstate,
											   true, &should_free);
			}
			else
			{
				_bt_buildadd(wstate, state, itup2);
				if (should_free2)
					pfree(itup2);
				itup2 = tuplesort_getindextuple(btspool2->sortstate,
												true, &should_free2);
			}
		}
		_bt_freeskey(indexScanKey);
	}
	else
	{
		/* merge is unnecessary */
		while ((itup = tuplesort_getindextuple(btspool->sortstate,
											   true, &should_free)) != NULL)
		{
			/* When we see the first tuple, create the first index page */
			if (state == NULL)
				state = _bt_pagestate(wstate, 0);

			_bt_buildadd(wstate, state, itup);
			if (should_free)
				pfree(itup);
		}
	}

	/* Close down final pages and write the metapage */
	_bt_uppershutdown(wstate, state);

	/*
	 * If the index is WAL-logged, we must fsync it down to disk before it's
	 * safe to commit the transaction.	(For a non-WAL-logged index we don't
	 * care since the index will be uninteresting after a crash anyway.)
	 *
	 * It's obvious that we must do this when not WAL-logging the build. It's
	 * less obvious that we have to do it even if we did WAL-log the index
	 * pages.  The reason is that since we're building outside shared buffers,
	 * a CHECKPOINT occurring during the build has no way to flush the
	 * previously written data to disk (indeed it won't know the index even
	 * exists).  A crash later on would replay WAL from the checkpoint,
	 * therefore it wouldn't replay our earlier WAL entries. If we do not
	 * fsync those pages here, they might still not be on disk when the crash
	 * occurs.
	 */
	if (RelationNeedsWAL(wstate->index))
	{
		RelationOpenSmgr(wstate->index);
		smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
	}
}
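The per-attribute ladder above (NULL ordering fixed by SK_BT_NULLS_FIRST, the result negated for SK_BT_DESC on the non-NULL path only) condenses into a small stand-alone comparator. A self-contained sketch with hypothetical flag names; the integer comparison stands in for the sk_func call:

#include <stdbool.h>

#define FLAG_DESC			0x01	/* hypothetical SK_BT_DESC stand-in */
#define FLAG_NULLS_FIRST	0x02	/* hypothetical SK_BT_NULLS_FIRST stand-in */

/*
 * Compare two possibly-NULL integer keys under the given ordering flags.
 * As in _bt_load, NULL ordering follows NULLS_FIRST and is not affected
 * by DESC; only the non-NULL comparison result is inverted.
 */
static int
compare_keys(bool isnull1, int v1, bool isnull2, int v2, int flags)
{
	int		cmp;

	if (isnull1 && isnull2)
		cmp = 0;									/* NULL "=" NULL */
	else if (isnull1)
		cmp = (flags & FLAG_NULLS_FIRST) ? -1 : 1;	/* NULL vs NOT_NULL */
	else if (isnull2)
		cmp = (flags & FLAG_NULLS_FIRST) ? 1 : -1;	/* NOT_NULL vs NULL */
	else
	{
		cmp = (v1 > v2) - (v1 < v2);	/* stand-in for sk_func */
		if (flags & FLAG_DESC)
			cmp = -cmp;
	}
	return cmp;
}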
Example #28
0
/*
 * pg_prewarm(regclass, mode text, fork text,
 *			  first_block int8, last_block int8)
 *
 * The first argument is the relation to be prewarmed; the second controls
 * how prewarming is done; legal options are 'prefetch', 'read', and 'buffer'.
 * The third is the name of the relation fork to be prewarmed.  The fourth
 * and fifth arguments specify the first and last block to be prewarmed.
 * If the fourth argument is NULL, it will be taken as 0; if the fifth argument
 * is NULL, it will be taken as the number of blocks in the relation.  The
 * return value is the number of blocks successfully prewarmed.
 */
Datum
pg_prewarm(PG_FUNCTION_ARGS)
{
	Oid			relOid;
	text	   *forkName;
	text	   *type;
	int64		first_block;
	int64		last_block;
	int64		nblocks;
	int64		blocks_done = 0;
	int64		block;
	Relation	rel;
	ForkNumber	forkNumber;
	char	   *forkString;
	char	   *ttype;
	PrewarmType ptype;
	AclResult	aclresult;
	static char blockbuffer[BLCKSZ];	/* scratch buffer for "read" mode */

	/* Basic sanity checking. */
	if (PG_ARGISNULL(0))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("relation cannot be null")));
	relOid = PG_GETARG_OID(0);
	if (PG_ARGISNULL(1))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 (errmsg("prewarm type cannot be null"))));
	type = PG_GETARG_TEXT_P(1);
	ttype = text_to_cstring(type);
	if (strcmp(ttype, "prefetch") == 0)
		ptype = PREWARM_PREFETCH;
	else if (strcmp(ttype, "read") == 0)
		ptype = PREWARM_READ;
	else if (strcmp(ttype, "buffer") == 0)
		ptype = PREWARM_BUFFER;
	else
	{
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("invalid prewarm type"),
				 errhint("Valid prewarm types are \"prefetch\", \"read\", and \"buffer\".")));
		PG_RETURN_INT64(0);		/* Placate compiler. */
	}
	if (PG_ARGISNULL(2))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 (errmsg("relation fork cannot be null"))));
	forkName = PG_GETARG_TEXT_P(2);
	forkString = text_to_cstring(forkName);
	forkNumber = forkname_to_number(forkString);

	/* Open relation and check privileges. */
	rel = relation_open(relOid, AccessShareLock);
	aclresult = pg_class_aclcheck(relOid, GetUserId(), ACL_SELECT);
	if (aclresult != ACLCHECK_OK)
		aclcheck_error(aclresult, ACL_KIND_CLASS, get_rel_name(relOid));

	/* Check that the fork exists. */
	RelationOpenSmgr(rel);
	if (!smgrexists(rel->rd_smgr, forkNumber))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("fork \"%s\" does not exist for this relation",
						forkString)));

	/* Validate block numbers, or handle nulls. */
	nblocks = RelationGetNumberOfBlocksInFork(rel, forkNumber);
	if (PG_ARGISNULL(3))
		first_block = 0;
	else
	{
		first_block = PG_GETARG_INT64(3);
		if (first_block < 0 || first_block >= nblocks)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("starting block number must be between 0 and " INT64_FORMAT,
							nblocks - 1)));
	}
	if (PG_ARGISNULL(4))
		last_block = nblocks - 1;
	else
	{
		last_block = PG_GETARG_INT64(4);
		if (last_block < 0 || last_block >= nblocks)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
			errmsg("ending block number must be between 0 and " INT64_FORMAT,
				   nblocks - 1)));
	}

	/* Now we're ready to do the real work. */
	if (ptype == PREWARM_PREFETCH)
	{
#ifdef USE_PREFETCH

		/*
		 * In prefetch mode, we just hint the OS to read the blocks, but we
		 * don't know whether it really does it, and we don't wait for it to
		 * finish.
		 *
		 * It would probably be better to pass our prefetch requests in chunks
		 * of a megabyte or maybe even a whole segment at a time, but there's
		 * no practical way to do that at present without a gross modularity
		 * violation, so we just do this.
		 */
		for (block = first_block; block <= last_block; ++block)
		{
			CHECK_FOR_INTERRUPTS();
			PrefetchBuffer(rel, forkNumber, block);
			++blocks_done;
		}
#else
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("prefetch is not supported by this build")));
#endif
	}
	else if (ptype == PREWARM_READ)
	{
		/*
		 * In read mode, we actually read the blocks, but not into shared
		 * buffers.  This is more portable than prefetch mode (it works
		 * everywhere) and is synchronous.
		 */
		for (block = first_block; block <= last_block; ++block)
		{
			CHECK_FOR_INTERRUPTS();
			smgrread(rel->rd_smgr, forkNumber, block, blockbuffer);
			++blocks_done;
		}
	}
	else if (ptype == PREWARM_BUFFER)
	{
		/*
		 * In buffer mode, we actually pull the data into shared_buffers.
		 */
		for (block = first_block; block <= last_block; ++block)
		{
			Buffer		buf;

			CHECK_FOR_INTERRUPTS();
			buf = ReadBufferExtended(rel, forkNumber, block, RBM_NORMAL, NULL);
			ReleaseBuffer(buf);
			++blocks_done;
		}
	}

	/* Close relation, release lock. */
	relation_close(rel, AccessShareLock);

	PG_RETURN_INT64(blocks_done);
}
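The NULL-defaulting and range checks on the block arguments are the subtle part of pg_prewarm's argument handling. A distilled, self-contained sketch of just that logic (hypothetical helper, not the extension's API):

#include <stdbool.h>
#include <stdint.h>

/*
 * Resolve optional first/last block arguments against the fork size,
 * following the rules in pg_prewarm: a NULL first argument means 0, a
 * NULL last argument means nblocks - 1, and both must land inside
 * [0, nblocks).  Returns false if a supplied bound is out of range.
 */
static bool
resolve_block_range(int64_t nblocks,
					const int64_t *first_arg, const int64_t *last_arg,
					int64_t *first, int64_t *last)
{
	*first = (first_arg != NULL) ? *first_arg : 0;
	*last = (last_arg != NULL) ? *last_arg : nblocks - 1;

	if (*first < 0 || *first >= nblocks)
		return false;			/* starting block out of range */
	if (*last < 0 || *last >= nblocks)
		return false;			/* ending block out of range */
	return true;
}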
Example #29
0
Relation DirectOpen_Open(
	DirectOpen *direct,

	Oid relationId,

	Oid tablespace,

	Oid database,

	Oid relfilenode,

	FormData_pg_class *pgClass,

	FormData_pg_attribute *attrArray,

	FormData_pg_am *pgAm,

	FormData_pg_index *pgIndex,

	int2 *indKeyArray,

	Oid *indClassArray,

	bool relHasOid)
{
	int natts;
	int i;

	Assert(pgClass != NULL);
	natts = pgClass->relnatts;

	if (relationId == -1)
		relationId = pgClass->relfilenode;		// Assume it is ok to use the relfilenode as the relationId in our limited usage.

	if (relfilenode == -1)
		relfilenode = pgClass->relfilenode;

	if (!direct->isInit)
	{
		/*
		 * Lots of hard-coded construction of the gp_persistent* relations
		 * and dependent objects like tuple descriptors, etc.
		 */

		direct->relationData.rd_refcnt = 0;
		direct->relationData.rd_isvalid = true;

		direct->relationData.rd_id = relationId;

		direct->relationData.rd_rel = pgClass;

		if (pgIndex != NULL)
		{
			int pgIndexFixedLen = offsetof(FormData_pg_index, indkey);
			int indKeyVectorLen = Int2VectorSize(natts);
			int2vector *indKeyVector;
			oidvector  *indClassVector;

			uint16		amstrategies;
			uint16		amsupport;

			Oid 	   *operator;
			RegProcedure *support;
			FmgrInfo   *supportinfo;

			Assert(pgAm != NULL);
			Assert(indKeyArray != NULL);
			Assert(indClassArray != NULL);

			/*
			 * Allocate Formdata_pg_index with fields through indkey
			 * where indkey is a variable length int2vector with indKeyArray values.
			 */
			direct->relationData.rd_index = 
						(FormData_pg_index*)palloc(
								pgIndexFixedLen + indKeyVectorLen);
			memcpy(direct->relationData.rd_index, pgIndex, pgIndexFixedLen);

			indKeyVector = buildint2vector(
									indKeyArray,
									natts);
			memcpy(
				&direct->relationData.rd_index->indkey, 
				indKeyVector, 
				indKeyVectorLen);

			pfree(indKeyVector);

			direct->relationData.rd_am = pgAm;

			amstrategies = pgAm->amstrategies;
			amsupport = pgAm->amsupport;

			direct->relationData.rd_indexcxt = TopMemoryContext;

			/*
			 * Allocate arrays to hold data
			 */
			direct->relationData.rd_aminfo = (RelationAmInfo *)
				MemoryContextAllocZero(TopMemoryContext, sizeof(RelationAmInfo));

			direct->relationData.rd_opfamily = (Oid *)
				MemoryContextAllocZero(TopMemoryContext, natts * sizeof(Oid));
			direct->relationData.rd_opcintype = (Oid *)
				MemoryContextAllocZero(TopMemoryContext, natts * sizeof(Oid));

			if (amstrategies > 0)
				operator = (Oid *)
					MemoryContextAllocZero(TopMemoryContext,
										   natts * amstrategies * sizeof(Oid));
			else
				operator = NULL;

			if (amsupport > 0)
			{
				int			nsupport = natts * amsupport;

				support = (RegProcedure *)
					MemoryContextAllocZero(TopMemoryContext, nsupport * sizeof(RegProcedure));
				supportinfo = (FmgrInfo *)
					MemoryContextAllocZero(TopMemoryContext, nsupport * sizeof(FmgrInfo));
			}
			else
			{
				support = NULL;
				supportinfo = NULL;
			}

			direct->relationData.rd_operator = operator;
			direct->relationData.rd_support = support;
			direct->relationData.rd_supportinfo = supportinfo;

			direct->relationData.rd_indoption = (int16 *)
				MemoryContextAllocZero(TopMemoryContext, natts * sizeof(int16));

			/*
			 * Create oidvector in rd_indclass with values from indClassArray.
			 */
			indClassVector = buildoidvector(indClassArray, natts);
 
			/*
			 * Fill the operator and support procedure OID arrays.	(aminfo and
			 * supportinfo are left as zeroes, and are filled on-the-fly when used)
			 */
			IndexSupportInitialize(indClassVector,
								   operator, support,
								   direct->relationData.rd_opfamily,
								   direct->relationData.rd_opcintype,
								   amstrategies, amsupport, natts);

			/*
			 * expressions and predicate cache will be filled later.
			 */
			direct->relationData.rd_indexprs = NIL;
			direct->relationData.rd_indpred = NIL;
			direct->relationData.rd_amcache = NULL;		
		}

		// Not much in terms of constraints.
		direct->constrData.has_not_null = true;

		/*
		 * Setup tuple descriptor for columns.
		 */
		direct->descData.natts = pgClass->relnatts;

		// Make the array of pointers.
		direct->descData.attrs = 
				(Form_pg_attribute*)
						MemoryContextAllocZero(
									TopMemoryContext, 
									sizeof(Form_pg_attribute) * pgClass->relnatts);

		for (i = 0; i < pgClass->relnatts; i++)
		{
			direct->descData.attrs[i] = 
						(Form_pg_attribute)
								MemoryContextAllocZero(
											TopMemoryContext, 
											sizeof(FormData_pg_attribute));

			memcpy(direct->descData.attrs[i], &(attrArray[i]), sizeof(FormData_pg_attribute));

			// Patch up relation id.
			direct->descData.attrs[i]->attrelid = relationId;
		}

		direct->descData.constr = &direct->constrData;
		direct->descData.tdtypeid = pgClass->reltype;
		direct->descData.tdtypmod = -1;
		direct->descData.tdqdtypmod = -1;
		direct->descData.tdhasoid = relHasOid;
		direct->descData.tdrefcount = 1;

		direct->relationData.rd_att = &direct->descData;

		direct->pgStat.t_id = relationId;
		direct->pgStat.t_shared = 1;

		direct->relationData.pgstat_info = &direct->pgStat;

		direct->isInit = true;
	}
	
	// UNDONE: For NON-SHARED relations, we should verify that we don't open
	// UNDONE: relations in different databases, or different relations in the
	// UNDONE: same database, at the same time!
	direct->relationData.rd_node.spcNode = tablespace;
	direct->relationData.rd_node.dbNode = database;
	direct->relationData.rd_node.relNode = relfilenode;

	direct->relationData.rd_targblock = InvalidBlockNumber;

	for (i = 0; i < direct->relationData.rd_rel->relnatts; i++)
	{
		Assert(direct->descData.attrs[i] != NULL);
		
		// Patch up relation id.
		direct->descData.attrs[i]->attrelid = direct->relationData.rd_id;
	}

	direct->relationData.rd_refcnt++;

	RelationOpenSmgr(&direct->relationData);

	return &direct->relationData;
}
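DirectOpen_Open follows an "initialize once, patch per call" shape: the expensive descriptor construction runs only on first use, while the relfilenode fields are re-stamped and the reference count bumped on every call. A minimal sketch of that pattern with hypothetical types:

#include <stdbool.h>

typedef struct CachedRel
{
	bool	isInit;
	int		heavyState;		/* stands in for descriptors built once */
	int		fileId;			/* stands in for rd_node / relfilenode */
	int		refcnt;
} CachedRel;

static CachedRel *
cached_open(CachedRel *rel, int fileId)
{
	if (!rel->isInit)
	{
		/* Expensive one-time construction, done exactly once. */
		rel->heavyState = 1;
		rel->isInit = true;
	}

	/* Cheap per-call patch-up, re-stamped on every open. */
	rel->fileId = fileId;
	rel->refcnt++;

	return rel;
}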