Example #1
0
/*
 *	btbuildempty() -- build an empty btree index in the initialization fork
 */
void
btbuildempty(Relation index)
{
	Page		metapage;

	/* Construct metapage. */
	metapage = (Page) palloc(BLCKSZ);
	_bt_initmetapage(metapage, P_NONE, 0);

	/*
	 * Write the page and log it.  It might seem that an immediate sync would
	 * be sufficient to guarantee that the file exists on disk, but recovery
	 * itself might remove it while replaying, for example, an
	 * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record.  Therefore, we need
	 * this even when wal_level=minimal.
	 */
	PageSetChecksumInplace(metapage, BTREE_METAPAGE);
	smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
			  (char *) metapage, true);
	log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
				BTREE_METAPAGE, metapage, true);

	/*
	 * An immediate sync is required even if we xlog'd the page, because the
	 * write did not go through shared_buffers and therefore a concurrent
	 * checkpoint may have moved the redo pointer past our xlog record.
	 */
	smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
}
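
The sequence above (set the checksum, write past shared_buffers with smgrwrite, WAL-log the full page, then smgrimmedsync the fork) recurs throughout the examples below. As a minimal sketch, assuming the same smgr/xlog signatures used in Example #1, the per-page steps could be factored into a helper; the helper name is hypothetical and not part of PostgreSQL:

/*
 * Hypothetical helper (a sketch, not PostgreSQL source): write one page
 * of an init fork using the checksum -> write -> WAL-log sequence from
 * Example #1.  The caller still owes one smgrimmedsync() on the fork,
 * because these writes bypass shared buffers entirely.
 */
static void
write_init_fork_page(Relation index, Page page, BlockNumber blkno)
{
	/* The checksum must be (re)computed after the last page change. */
	PageSetChecksumInplace(page, blkno);

	/* skipFsync=true: durability is deferred to the final sync. */
	smgrwrite(index->rd_smgr, INIT_FORKNUM, blkno, (char *) page, true);

	/* Log the full page so recovery can recreate the init fork. */
	log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
				blkno, page, true);
}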
Example #2
0
/*
 * Build an empty SPGiST index in the initialization fork
 */
Datum
spgbuildempty(PG_FUNCTION_ARGS)
{
	Relation	index = (Relation) PG_GETARG_POINTER(0);
	Page		page;

	/* Construct metapage. */
	page = (Page) palloc(BLCKSZ);
	SpGistInitMetapage(page);

	/* Write the page.  If archiving/streaming, XLOG it. */
	smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_METAPAGE_BLKNO,
			  (char *) page, true);
	if (XLogIsNeeded())
		log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
					SPGIST_METAPAGE_BLKNO, page);

	/* Likewise for the root page. */
	SpGistInitPage(page, SPGIST_LEAF);

	smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_HEAD_BLKNO,
			  (char *) page, true);
	if (XLogIsNeeded())
		log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
					SPGIST_HEAD_BLKNO, page);

	/*
	 * An immediate sync is required even if we xlog'd the pages, because the
	 * writes did not go through shared buffers and therefore a concurrent
	 * checkpoint may have moved the redo pointer past our xlog record.
	 */
	smgrimmedsync(index->rd_smgr, INIT_FORKNUM);

	PG_RETURN_VOID();
}
Example #3
0
/*
 *	btbuildempty() -- build an empty btree index in the initialization fork
 */
Datum
btbuildempty(PG_FUNCTION_ARGS)
{
	Relation	index = (Relation) PG_GETARG_POINTER(0);
	Page		metapage;

	/* Construct metapage. */
	metapage = (Page) palloc(BLCKSZ);
	_bt_initmetapage(metapage, P_NONE, 0);

	/* Write the page.  If archiving/streaming, XLOG it. */
	smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
			  (char *) metapage, true);
	if (XLogIsNeeded())
		log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
					BTREE_METAPAGE, metapage);

	/*
	 * An immediate sync is required even if we xlog'd the page, because the
	 * write did not go through shared_buffers and therefore a concurrent
	 * checkpoint may have moved the redo pointer past our xlog record.
	 */
	smgrimmedsync(index->rd_smgr, INIT_FORKNUM);

	PG_RETURN_VOID();
}
Example #4
0
/*
 * SetMatViewToPopulated
 *		Indicate that the materialized view has been populated by its query.
 *
 * NOTE: The heap starts out in a state that doesn't look scannable, and can
 * only transition from there to scannable at the time a new heap is created.
 *
 * NOTE: caller must be holding an appropriate lock on the relation.
 */
void
SetMatViewToPopulated(Relation relation)
{
	Page        page;

	Assert(relation->rd_rel->relkind == RELKIND_MATVIEW);
	Assert(relation->rd_ispopulated == false);

	page = (Page) palloc(BLCKSZ);
	PageInit(page, BLCKSZ, 0);

	if (RelationNeedsWAL(relation))
		log_newpage(&(relation->rd_node), MAIN_FORKNUM, 0, page);

	RelationOpenSmgr(relation);

	PageSetChecksumInplace(page, 0);
	smgrextend(relation->rd_smgr, MAIN_FORKNUM, 0, (char *) page, true);

	pfree(page);

	smgrimmedsync(relation->rd_smgr, MAIN_FORKNUM);

	RelationCacheInvalidateEntry(relation->rd_id);
}
Example #5
0
/*
 * Build an empty SPGiST index in the initialization fork
 */
void
spgbuildempty(Relation index)
{
	Page		page;

	/* Construct metapage. */
	page = (Page) palloc(BLCKSZ);
	SpGistInitMetapage(page);

	/* Write the page.  If archiving/streaming, XLOG it. */
	PageSetChecksumInplace(page, SPGIST_METAPAGE_BLKNO);
	smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_METAPAGE_BLKNO,
			  (char *) page, true);
	if (XLogIsNeeded())
		log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
					SPGIST_METAPAGE_BLKNO, page, false);

	/* Likewise for the root page. */
	SpGistInitPage(page, SPGIST_LEAF);

	PageSetChecksumInplace(page, SPGIST_ROOT_BLKNO);
	smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_ROOT_BLKNO,
			  (char *) page, true);
	if (XLogIsNeeded())
		log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
					SPGIST_ROOT_BLKNO, page, true);

	/* Likewise for the null-tuples root page. */
	SpGistInitPage(page, SPGIST_LEAF | SPGIST_NULLS);

	PageSetChecksumInplace(page, SPGIST_NULL_BLKNO);
	smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_NULL_BLKNO,
			  (char *) page, true);
	if (XLogIsNeeded())
		log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
					SPGIST_NULL_BLKNO, page, true);

	/*
	 * An immediate sync is required even if we xlog'd the pages, because the
	 * writes did not go through shared buffers and therefore a concurrent
	 * checkpoint may have moved the redo pointer past our xlog record.
	 */
	smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
}
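
Example #5 writes three pages (metapage, root, nulls root) with near-identical code. A hedged sketch of the same work as a loop, assuming the SPGIST_* block numbers and init functions shown above; the function name and table layout are hypothetical:

/*
 * Sketch only: condenses Example #5's three write blocks into a loop.
 * Assumes the same SP-GiST constants and init calls as above.
 */
static void
spg_build_empty_sketch(Relation index)
{
	struct
	{
		BlockNumber blkno;
		bool		page_std;	/* last argument passed to log_newpage() */
	}			blocks[] = {
		{SPGIST_METAPAGE_BLKNO, false},
		{SPGIST_ROOT_BLKNO, true},
		{SPGIST_NULL_BLKNO, true},
	};
	Page		page = (Page) palloc(BLCKSZ);
	int			i;

	for (i = 0; i < 3; i++)
	{
		if (blocks[i].blkno == SPGIST_METAPAGE_BLKNO)
			SpGistInitMetapage(page);
		else
			SpGistInitPage(page, SPGIST_LEAF |
						   (blocks[i].blkno == SPGIST_NULL_BLKNO ?
							SPGIST_NULLS : 0));

		PageSetChecksumInplace(page, blocks[i].blkno);
		smgrwrite(index->rd_smgr, INIT_FORKNUM, blocks[i].blkno,
				  (char *) page, true);
		if (XLogIsNeeded())
			log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
						blocks[i].blkno, page, blocks[i].page_std);
	}

	/* As in Example #5, an immediate sync is still required. */
	smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
}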
Example #6
0
/*
 *	btbuildempty() -- build an empty btree index in the initialization fork
 */
void
btbuildempty(Relation index)
{
	Page		metapage;

	/* Construct metapage. */
	metapage = (Page) palloc(BLCKSZ);
	_bt_initmetapage(metapage, P_NONE, 0);

	/* Write the page.  If archiving/streaming, XLOG it. */
	PageSetChecksumInplace(metapage, BTREE_METAPAGE);
	smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
			  (char *) metapage, true);
	if (XLogIsNeeded())
		log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
					BTREE_METAPAGE, metapage, false);

	/*
	 * An immediate sync is required even if we xlog'd the page, because the
	 * write did not go through shared_buffers and therefore a concurrent
	 * checkpoint may have moved the redo pointer past our xlog record.
	 */
	smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
}
Example #7
0
/*
 * Read tuples in correct sort order from tuplesort, and load them into
 * btree leaves.
 */
static void
_bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
{
	BTPageState *state = NULL;
	bool		merge = (btspool2 != NULL);
	IndexTuple	itup,
				itup2 = NULL;
	bool		load1;
	TupleDesc	tupdes = RelationGetDescr(wstate->index);
	int			i,
				keysz = RelationGetNumberOfAttributes(wstate->index);
	ScanKey		indexScanKey = NULL;
	SortSupport sortKeys;

	if (merge)
	{
		/*
		 * Another BTSpool for dead tuples exists. Now we have to merge
		 * btspool and btspool2.
		 */

		/* Prepare for the merge. */
		itup = tuplesort_getindextuple(btspool->sortstate, true);
		itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
		indexScanKey = _bt_mkscankey_nodata(wstate->index);

		/* Prepare SortSupport data for each column */
		sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));

		for (i = 0; i < keysz; i++)
		{
			SortSupport sortKey = sortKeys + i;
			ScanKey		scanKey = indexScanKey + i;
			int16		strategy;

			sortKey->ssup_cxt = CurrentMemoryContext;
			sortKey->ssup_collation = scanKey->sk_collation;
			sortKey->ssup_nulls_first =
				(scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
			sortKey->ssup_attno = scanKey->sk_attno;
			/* Abbreviation is not supported here */
			sortKey->abbreviate = false;

			AssertState(sortKey->ssup_attno != 0);

			strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
				BTGreaterStrategyNumber : BTLessStrategyNumber;

			PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey);
		}

		_bt_freeskey(indexScanKey);

		for (;;)
		{
			load1 = true;		/* load BTSpool next? */
			if (itup2 == NULL)
			{
				if (itup == NULL)
					break;
			}
			else if (itup != NULL)
			{
				for (i = 1; i <= keysz; i++)
				{
					SortSupport entry;
					Datum		attrDatum1,
								attrDatum2;
					bool		isNull1,
								isNull2;
					int32		compare;

					entry = sortKeys + i - 1;
					attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
					attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);

					compare = ApplySortComparator(attrDatum1, isNull1,
												  attrDatum2, isNull2,
												  entry);
					if (compare > 0)
					{
						load1 = false;
						break;
					}
					else if (compare < 0)
						break;
				}
			}
			else
				load1 = false;

			/* When we see first tuple, create first index page */
			if (state == NULL)
				state = _bt_pagestate(wstate, 0);

			if (load1)
			{
				_bt_buildadd(wstate, state, itup);
				itup = tuplesort_getindextuple(btspool->sortstate, true);
			}
			else
			{
				_bt_buildadd(wstate, state, itup2);
				itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
			}
		}
		pfree(sortKeys);
	}
	else
	{
		/* merge is unnecessary */
		while ((itup = tuplesort_getindextuple(btspool->sortstate,
											   true)) != NULL)
		{
			/* When we see first tuple, create first index page */
			if (state == NULL)
				state = _bt_pagestate(wstate, 0);

			_bt_buildadd(wstate, state, itup);
		}
	}

	/* Close down final pages and write the metapage */
	_bt_uppershutdown(wstate, state);

	/*
	 * If the index is WAL-logged, we must fsync it down to disk before it's
	 * safe to commit the transaction.  (For a non-WAL-logged index we don't
	 * care since the index will be uninteresting after a crash anyway.)
	 *
	 * It's obvious that we must do this when not WAL-logging the build. It's
	 * less obvious that we have to do it even if we did WAL-log the index
	 * pages.  The reason is that since we're building outside shared buffers,
	 * a CHECKPOINT occurring during the build has no way to flush the
	 * previously written data to disk (indeed it won't know the index even
	 * exists).  A crash later on would replay WAL from the checkpoint,
	 * therefore it wouldn't replay our earlier WAL entries. If we do not
	 * fsync those pages here, they might still not be on disk when the crash
	 * occurs.
	 */
	if (RelationNeedsWAL(wstate->index))
	{
		RelationOpenSmgr(wstate->index);
		smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
	}
}
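
The merge branch of _bt_load is a classic two-way merge: pull the smaller head of either stream, with ties going to the first spool. A self-contained sketch of just that control flow, with hypothetical next()/cmp()/emit() callbacks standing in for tuplesort_getindextuple, the SortSupport comparison, and _bt_buildadd:

#include <stddef.h>

typedef struct MergeSource
{
	void	   *(*next) (void *arg);	/* returns NULL when exhausted */
	void	   *arg;
} MergeSource;

static void
two_way_merge(MergeSource *a, MergeSource *b,
			  int (*cmp) (const void *, const void *),
			  void (*emit) (void *item))
{
	void	   *x = a->next(a->arg);
	void	   *y = b->next(b->arg);

	while (x != NULL || y != NULL)
	{
		/* Prefer stream a on ties, matching _bt_load's "load1" rule. */
		if (y == NULL || (x != NULL && cmp(x, y) <= 0))
		{
			emit(x);
			x = a->next(a->arg);
		}
		else
		{
			emit(y);
			y = b->next(b->arg);
		}
	}
}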
Example #8
0
/*
 * _bt_mergeload - Merge two streams of index tuples into new index files.
 */
static void
_bt_mergeload(Spooler *self, BTWriteState *wstate, BTSpool *btspool, BTReader *btspool2, Relation heapRel)
{
	BTPageState	   *state = NULL;
	IndexTuple		itup,
					itup2;
	bool			should_free = false;
	TupleDesc		tupdes = RelationGetDescr(wstate->index);
	int				keysz = RelationGetNumberOfAttributes(wstate->index);
	ScanKey			indexScanKey;
	ON_DUPLICATE	on_duplicate = self->on_duplicate;

	Assert(btspool != NULL);

	/* Prepare for the merge. */
	itup = BTSpoolGetNextItem(btspool, NULL, &should_free);
	itup2 = BTReaderGetNextItem(btspool2);
	indexScanKey = _bt_mkscankey_nodata(wstate->index);

	for (;;)
	{
		bool	load1 = true;		/* load BTSpool next? */
		bool	hasnull;
		int32	compare;

		if (self->dup_old + self->dup_new > self->max_dup_errors)
			ereport(ERROR,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("Maximum duplicate error count exceeded")));

		if (itup2 == NULL)
		{
			if (itup == NULL)
				break;
		}
		else if (itup != NULL)
		{
			compare = compare_indextuple(itup, itup2, indexScanKey,
										 keysz, tupdes, &hasnull);

			if (compare == 0 && !hasnull && btspool->isunique)
			{
				ItemPointerData t_tid2;

				/*
				 * heap_is_visible() updates t_tid, but we still need the
				 * original value for the index, so back it up first.
				 */
				ItemPointerCopy(&itup2->t_tid, &t_tid2);

				/* The tuple pointed to by the old index entry should not be visible. */
				if (!heap_is_visible(heapRel, &itup->t_tid))
				{
					itup = BTSpoolGetNextItem(btspool, itup, &should_free);
				}
				else if (!heap_is_visible(heapRel, &itup2->t_tid))
				{
					itup2 = BTReaderGetNextItem(btspool2);
				}
				else
				{
					if (on_duplicate == ON_DUPLICATE_KEEP_NEW)
					{
						self->dup_old++;
						remove_duplicate(self, heapRel, itup2,
							RelationGetRelationName(wstate->index));
						itup2 = BTReaderGetNextItem(btspool2);
					}
					else
					{
						ItemPointerCopy(&t_tid2, &itup2->t_tid);
						self->dup_new++;
						remove_duplicate(self, heapRel, itup,
							RelationGetRelationName(wstate->index));
						itup = BTSpoolGetNextItem(btspool, itup, &should_free);
					}
				}

				continue;
			}
			else if (compare > 0)
				load1 = false;
		}
		else
			load1 = false;

		BULKLOAD_PROFILE(&prof_merge_unique);

		/* When we see first tuple, create first index page */
		if (state == NULL)
			state = _bt_pagestate(wstate, 0);

		if (load1)
		{
			IndexTuple	next_itup = NULL;
			bool		next_should_free = false;

			for (;;)
			{
				/* get next item */
				next_itup = BTSpoolGetNextItem(btspool, next_itup,
											   &next_should_free);

				if (!btspool->isunique || next_itup == NULL)
					break;

				compare = compare_indextuple(itup, next_itup, indexScanKey,
											 keysz, tupdes, &hasnull);
				if (compare < 0 || hasnull)
					break;

				if (compare > 0)
				{
					/* shouldn't happen */
					elog(ERROR, "failed in tuplesort_performsort");
				}

				/*
				 * If the tuple was deleted by another unique index, it is
				 * no longer visible.
				 */
				if (!heap_is_visible(heapRel, &next_itup->t_tid))
				{
					continue;
				}

				if (!heap_is_visible(heapRel, &itup->t_tid))
				{
					if (should_free)
						pfree(itup);

					itup = next_itup;
					should_free = next_should_free;
					next_should_free = false;
					continue;
				}

				/* not unique between input files */
				self->dup_new++;
				remove_duplicate(self, heapRel, next_itup,
								 RelationGetRelationName(wstate->index));

				if (self->dup_old + self->dup_new > self->max_dup_errors)
					ereport(ERROR,
							(errcode(ERRCODE_INTERNAL_ERROR),
							 errmsg("Maximum duplicate error count exceeded")));
			}

			_bt_buildadd(wstate, state, itup);

			if (should_free)
				pfree(itup);

			itup = next_itup;
			should_free = next_should_free;
		}
		else
		{
			_bt_buildadd(wstate, state, itup2);
			itup2 = BTReaderGetNextItem(btspool2);
		}
		BULKLOAD_PROFILE(&prof_merge_insert);
	}
	_bt_freeskey(indexScanKey);

	/* Close down final pages and write the metapage */
	_bt_uppershutdown(wstate, state);

	/*
	 * If the index isn't temp, we must fsync it down to disk before it's safe
	 * to commit the transaction.  (For a temp index we don't care since the
	 * index will be uninteresting after a crash anyway.)
	 *
	 * It's obvious that we must do this when not WAL-logging the build. It's
	 * less obvious that we have to do it even if we did WAL-log the index
	 * pages.  The reason is that since we're building outside shared buffers,
	 * a CHECKPOINT occurring during the build has no way to flush the
	 * previously written data to disk (indeed it won't know the index even
	 * exists).  A crash later on would replay WAL from the checkpoint,
	 * therefore it wouldn't replay our earlier WAL entries. If we do not
	 * fsync those pages here, they might still not be on disk when the crash
	 * occurs.
	 */
	if (!RELATION_IS_LOCAL(wstate->index))
	{
		RelationOpenSmgr(wstate->index);
		smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
	}
	BULKLOAD_PROFILE(&prof_merge_term);
}
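
When _bt_mergeload meets two equal, non-NULL keys in a unique index, it resolves the conflict by heap visibility first and by the on_duplicate setting only when both tuples are live. A condensed sketch of that rule as a hypothetical decision helper ("keep_new" mirrors ON_DUPLICATE_KEEP_NEW; the enum and function are illustrative, not pg_bulkload API):

#include <stdbool.h>

typedef enum
{
	DROP_OLD,					/* discard the old index entry (itup2) */
	DROP_NEW					/* discard the new spool tuple (itup) */
} DupAction;

static DupAction
resolve_duplicate(bool new_visible, bool old_visible, bool keep_new)
{
	if (!new_visible)
		return DROP_NEW;		/* new tuple already dead: just skip it */
	if (!old_visible)
		return DROP_OLD;		/* old entry points at a dead tuple */
	/* Both visible: honor the configured duplicate preference. */
	return keep_new ? DROP_OLD : DROP_NEW;
}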
Example #9
0
static int
FileRepPrimary_ResyncBufferPoolIncrementalWrite(ChangeTrackingRequest *request)
{
	int				status = STATUS_OK;
	Page			page;
	Buffer			buf; 
	BlockNumber		numBlocks = 0;
	SMgrRelation	smgr_relation = NULL;
	char			relidstr[OIDCHARS + 1 + OIDCHARS + 1 + OIDCHARS + 1];
	int				ii;
	XLogRecPtr		loc;
	XLogRecPtr		loc1;
	int				count = 0;
	int				thresholdCount = 0;
	bool			mirrorDataLossOccurred = FALSE;
	int				NumberOfRelations = request->count;
	
	FileRepResyncHashEntry_s	entry;
	ChangeTrackingResult		*result = NULL;	

	while (1)
	{
		/* allow flushing buffers from buffer pool during scan */
		FileRepResync_SetReadBufferRequest();
		if ((result = ChangeTracking_GetChanges(request)) != NULL) 
		{
			FileRepResync_ResetReadBufferRequest();
					
			for (ii = 0; ii < result->count; ii++)
			{
				
				if (smgr_relation == NULL)
				{
					NumberOfRelations--;
					
					smgr_relation = smgropen(result->entries[ii].relFileNode);
					
					snprintf(relidstr, sizeof(relidstr), "%u/%u/%u",
							 smgr_relation->smgr_rnode.spcNode,
							 smgr_relation->smgr_rnode.dbNode,
							 smgr_relation->smgr_rnode.relNode);

					numBlocks = smgrnblocks(smgr_relation);
					
					if (Debug_filerep_print)
						elog(LOG, "resynchronize buffer pool relation '%u/%u/%u' "
							 "number of blocks:'%u' ",
							 smgr_relation->smgr_rnode.spcNode,
							 smgr_relation->smgr_rnode.dbNode,
							 smgr_relation->smgr_rnode.relNode,
							 numBlocks);
					
					thresholdCount = Min(numBlocks, 1024);
				}
				
				loc1 =  result->entries[ii].lsn_end;
				
				/*
				 * If the relation was truncated, block_num from change
				 * tracking can be beyond numBlocks.
				 */
				if (result->entries[ii].block_num >=  numBlocks)
				{
					ereport(LOG,	
							(errmsg("could not resynchronize buffer pool relation '%s' block '%d' (maybe due to truncate), "
									"lsn change tracking '%s(%u/%u)' "
									"number of blocks '%d' ",
									relidstr,
									result->entries[ii].block_num,
									XLogLocationToString(&loc1),
									loc1.xlogid,
									loc1.xrecoff,
									numBlocks),						
							 FileRep_errcontext()));						
					
					goto flush_check;
				}
				
				/* allow flushing buffers from buffer pool during scan */
				FileRepResync_SetReadBufferRequest();
				buf = ReadBuffer_Resync(smgr_relation,
										result->entries[ii].block_num,
										relidstr);
				FileRepResync_ResetReadBufferRequest();
				
				Assert(result->entries[ii].block_num < numBlocks);
				
				LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
				page = BufferGetPage(buf);
				
				loc = PageGetLSN(page); 
				
				if(Debug_filerep_print)
				{
					elog(LOG,	
							"incremental resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn page '%s(%u/%u)' "
							"lsn end change tracking '%s(%u/%u)' ",
							relidstr,
							numBlocks,
							result->entries[ii].block_num,
							XLogLocationToString(&loc),
							loc.xlogid,
							loc.xrecoff,
							XLogLocationToString(&loc1),
							result->entries[ii].lsn_end.xlogid,
							result->entries[ii].lsn_end.xrecoff);					
				}
				else
				{
					char	tmpBuf[FILEREP_MAX_LOG_DESCRIPTION_LEN];
					
					snprintf(tmpBuf, sizeof(tmpBuf), 
							 "incremental resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn page '%s(%u/%u)' ",
							 relidstr,
							 numBlocks,
							 result->entries[ii].block_num,
							 XLogLocationToString(&loc),
							 loc.xlogid,
							 loc.xrecoff);
					
					FileRep_InsertConfigLogEntry(tmpBuf);
					
					snprintf(tmpBuf, sizeof(tmpBuf), 
							 "incremental resync buffer pool identifier '%s' lsn end change tracking '%s(%u/%u)' ",
							 relidstr,
							 XLogLocationToString(&loc1),
							 result->entries[ii].lsn_end.xlogid,
							 result->entries[ii].lsn_end.xrecoff);
					
					FileRep_InsertConfigLogEntry(tmpBuf);
					
				}
								
				if (XLByteLE(result->entries[ii].lsn_end, PageGetLSN(page)))
				{
					if (! XLByteEQ(PageGetLSN(page), result->entries[ii].lsn_end))
					{
						ereport(LOG,
							(errmsg("Resynchronize buffer pool relation '%s' block '%d' has page lsn greater than CT end lsn, "
								"lsn page '%s(%u/%u)' lsn end change tracking '%s(%u/%u)' "
								"number of blocks '%d'",
								relidstr,
								result->entries[ii].block_num,
								XLogLocationToString(&loc),
								loc.xlogid,
								loc.xrecoff,
								XLogLocationToString(&loc1),
								loc1.xlogid,
								loc1.xrecoff,
								numBlocks),
							 FileRep_errcontext()));

					}

					/*
					 * It's safe, and better, to write the page to the mirror
					 * in this case, since primary and mirror data pages should
					 * always be the same.  We might do some extra work, but we
					 * definitely won't lose blocks, or error out and need to
					 * perform a full recovery.  We need to cover this case
					 * because there are known scenarios where the CT file can
					 * have extra records that should have been discarded, but
					 * cannot be, because the xlog LSN information has been
					 * lost.  One such case is when compaction of CT_TRANSIENT
					 * into CT_COMPACT with a specific xlog LSN (to discard the
					 * extra records) in CT mode is interrupted by resync.
					 * Compaction during resync collects all the CT records and
					 * has no xlog LSN information with which to discard any
					 * extra records from CT_TRANSIENT.
					 */

					smgrwrite(smgr_relation,
							  result->entries[ii].block_num,
							  (char *)BufferGetBlock(buf),
							  FALSE);
				}

#ifdef FAULT_INJECTOR	
				FaultInjector_InjectFaultIfSet(
											   FileRepResyncWorker, 
											   DDLNotSpecified,
											   "",	// databaseName
											   ""); // tableName
#endif				
				
				UnlockReleaseBuffer(buf);
				
#ifdef FAULT_INJECTOR	
				FaultInjector_InjectFaultIfSet(
											   FileRepResyncWorker, 
											   DDLNotSpecified,
											   "",	// databaseName
											   ""); // tableName
#endif				
		
	flush_check:			
				if (((ii + 1) == result->count) ||
					! (result->entries[ii].relFileNode.spcNode == result->entries[ii+1].relFileNode.spcNode &&
					   result->entries[ii].relFileNode.dbNode == result->entries[ii+1].relFileNode.dbNode &&
					   result->entries[ii].relFileNode.relNode == result->entries[ii+1].relFileNode.relNode))
				{
					if (result->ask_for_more == false)
					{
								
						smgrimmedsync(smgr_relation);
						
						smgrclose(smgr_relation);
								 
						smgr_relation = NULL;
							
						FileRep_GetRelationPath(
												 entry.fileName, 
												 result->entries[ii].relFileNode, 
												 0 /* segment file number is always 0 for Buffer Pool */);							 
								 
						status = FileRepResync_UpdateEntry(&entry);
						if (status != STATUS_OK)
						{
							 break;
						}
					}
								 
				}			
							
				if (count > thresholdCount)
				{
					count = 0;
					FileRepSubProcess_ProcessSignals();
					
					if (! (FileRepSubProcess_GetState() == FileRepStateReady && 
						   dataState == DataStateInResync))
					{
						mirrorDataLossOccurred = TRUE;
						break;
					}
				}
				else
					count++;
			}  // for (ii = 0; ii < result->count; ii++)
			
		} // if ((result = ChangeTracking_GetChanges(request)) != NULL) 
		
		FileRepResync_ResetReadBufferRequest();
			
		if (result != NULL && result->ask_for_more == true)
		{
			Assert(request->count == 1);
			request->entries[0].lsn_start = result->next_start_lsn;
		}
		else
		{
			break;
		}

	} // while(1) 
		
	ChangeTracking_FreeRequest(request);
	ChangeTracking_FreeResult(result);
	
	Insist(NumberOfRelations == 0);
	
	if (mirrorDataLossOccurred)
		status = STATUS_ERROR;
	
	return status;	
}
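
The count/thresholdCount logic in Examples #9 and #10 is a throttle: it rechecks signals and FileRep state only once per up-to-1024 pages rather than on every page. A standalone sketch of that pattern, with hypothetical check_signals()/still_ok() stand-ins for FileRepSubProcess_ProcessSignals() and the state test:

static int
throttled_scan(int nblocks, int threshold,
			   void (*check_signals) (void), int (*still_ok) (void))
{
	int			count = 0;
	int			blkno;

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		/* ... per-page work would go here ... */

		if (count > threshold)
		{
			count = 0;
			check_signals();
			if (!still_ok())
				return -1;		/* caller treats this as mirror data loss */
		}
		else
			count++;
	}
	return 0;
}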
Example #10
0
static int
FileRepPrimary_ResyncWrite(FileRepResyncHashEntry_s	*entry)
{

	int				status = STATUS_OK;
	Page			page;
	Buffer			buf; 
	BlockNumber		numBlocks;
	BlockNumber		blkno;
	SMgrRelation	smgr_relation;
	char			relidstr[OIDCHARS + 1 + OIDCHARS + 1 + OIDCHARS + 1];
	XLogRecPtr		loc;
	int				count = 0;
	int				thresholdCount = 0;
	bool			mirrorDataLossOccurred = FALSE;
		
	switch (entry->relStorageMgr)
	{

		case PersistentFileSysRelStorageMgr_BufferPool:
			
			switch (entry->mirrorDataSynchronizationState)
			{
				case MirroredRelDataSynchronizationState_BufferPoolScanIncremental:
				case MirroredRelDataSynchronizationState_FullCopy:

					smgr_relation = smgropen(entry->relFileNode);
					
					numBlocks = smgrnblocks(smgr_relation);

					snprintf(relidstr, sizeof(relidstr), "%u/%u/%u",
							 smgr_relation->smgr_rnode.spcNode,
							 smgr_relation->smgr_rnode.dbNode,
							 smgr_relation->smgr_rnode.relNode);

					if (Debug_filerep_print)
						elog(LOG, "resync buffer pool relation '%s' number of blocks '%d' ",
							 relidstr, numBlocks);

					thresholdCount = Min(numBlocks, 1024);
					
					/* 
					 * required in order to report how many blocks were synchronized 
					 * if gp_persistent_relation_node does not return that information 
					 */
					if (entry->mirrorBufpoolResyncChangedPageCount == 0)
					{
						entry->mirrorBufpoolResyncChangedPageCount = numBlocks - entry->mirrorBufpoolResyncCkptBlockNum;
					}
					
					for (blkno = entry->mirrorBufpoolResyncCkptBlockNum; blkno < numBlocks; blkno++) 
					{
						XLogRecPtr	endResyncLSN = (isFullResync() ? 
													FileRepResync_GetEndFullResyncLSN() :
													FileRepResync_GetEndIncrResyncLSN());
#ifdef FAULT_INJECTOR
						FaultInjector_InjectFaultIfSet(
													   FileRepResyncWorkerRead,
													   DDLNotSpecified,
													   "",	//databaseName
													   ""); // tableName
#endif				
						
						FileRepResync_SetReadBufferRequest();
						buf = ReadBuffer_Resync(smgr_relation, blkno, relidstr);
						FileRepResync_ResetReadBufferRequest();
						
						LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
						page = BufferGetPage(buf);
						
						loc = PageGetLSN(page);
						
						if (Debug_filerep_print)
						{
							elog(LOG, 
									 "full resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn begin change tracking '%s(%u/%u)' "
									 "lsn page '%s(%u/%u)' lsn end change tracking '%s(%u/%u)' ",
									 relidstr,
									 numBlocks,
									 blkno,
									 XLogLocationToString(&entry->mirrorBufpoolResyncCkptLoc),
									 entry->mirrorBufpoolResyncCkptLoc.xlogid,
									 entry->mirrorBufpoolResyncCkptLoc.xrecoff,
									 XLogLocationToString(&loc),
									 loc.xlogid,
									 loc.xrecoff,
									 XLogLocationToString(&endResyncLSN),
									 endResyncLSN.xlogid,
									 endResyncLSN.xrecoff);
						}
						else
						{
							char	tmpBuf[FILEREP_MAX_LOG_DESCRIPTION_LEN];
							
							snprintf(tmpBuf, sizeof(tmpBuf), 
									 "full resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn begin change tracking '%s(%u/%u)' ",
									 relidstr,
									 numBlocks,
									 blkno,
									 XLogLocationToString(&entry->mirrorBufpoolResyncCkptLoc),
									 entry->mirrorBufpoolResyncCkptLoc.xlogid,
									 entry->mirrorBufpoolResyncCkptLoc.xrecoff);
														
							FileRep_InsertConfigLogEntry(tmpBuf);
							
							snprintf(tmpBuf, sizeof(tmpBuf), 
									 "full resync buffer pool identifier '%s' lsn page '%s(%u/%u)' lsn end change tracking '%s(%u/%u)' ",
									 relidstr,
									 XLogLocationToString(&loc),
									 loc.xlogid,
									 loc.xrecoff,
									 XLogLocationToString(&endResyncLSN),
									 endResyncLSN.xlogid,
									 endResyncLSN.xrecoff);
							
							FileRep_InsertConfigLogEntry(tmpBuf);
							
						}
						
						if (XLByteLE(PageGetLSN(page), endResyncLSN) &&
							XLByteLE(entry->mirrorBufpoolResyncCkptLoc, PageGetLSN(page))) 
						{
							smgrwrite(smgr_relation, 
									  blkno,
									  (char *)BufferGetBlock(buf),
									  FALSE);
						}
						
#ifdef FAULT_INJECTOR	
						FaultInjector_InjectFaultIfSet(
													   FileRepResyncWorker, 
													   DDLNotSpecified,
													   "",	// databaseName
													   ""); // tableName
#endif				
						
						UnlockReleaseBuffer(buf);
						
						if (count > thresholdCount)
						{
							count = 0;
							FileRepSubProcess_ProcessSignals();
							
							if (! (FileRepSubProcess_GetState() == FileRepStateReady && 
								   dataState == DataStateInResync))
							{
								mirrorDataLossOccurred = TRUE;
								break;
							}
						}
						else
							count++;
					}
						
					if (mirrorDataLossOccurred)
						break;

					if (entry->mirrorDataSynchronizationState != MirroredRelDataSynchronizationState_FullCopy)
					{
						LockRelationForResyncExtension(&smgr_relation->smgr_rnode, ExclusiveLock);
					
						numBlocks = smgrnblocks(smgr_relation);
					
						smgrtruncate(smgr_relation,
								 numBlocks,
								 TRUE /* isTemp, TRUE means to not record in XLOG */,
								 FALSE /* isLocalBuf */,
								 &entry->persistentTid,
								 entry->persistentSerialNum);
								 
						UnlockRelationForResyncExtension(&smgr_relation->smgr_rnode, ExclusiveLock);
					}
					
					smgrimmedsync(smgr_relation);
					smgrclose(smgr_relation);
					
					smgr_relation = NULL;
					break;
					
				case MirroredRelDataSynchronizationState_None:										
				case MirroredRelDataSynchronizationState_DataSynchronized:
					break;
					
				default:
					ereport(LOG, 
							(errmsg("could not resynchronize relation '%u/%u/%u' "
									"mirror synchronization state:'%s(%d)' ",
									entry->relFileNode.spcNode,
									entry->relFileNode.dbNode,
									entry->relFileNode.relNode,
									MirroredRelDataSynchronizationState_Name(entry->mirrorDataSynchronizationState),
									entry->mirrorDataSynchronizationState)));
					break;
			}
			break;
			
		case PersistentFileSysRelStorageMgr_AppendOnly:
		{
			MirroredAppendOnlyOpen	mirroredOpen;
			int						primaryError;
			bool					mirrorDataLossOccurred;
			char					*buffer = NULL;
			int64					endOffset = entry->mirrorAppendOnlyNewEof;
			int64					startOffset = entry->mirrorAppendOnlyLossEof;
			int32					bufferLen = 0;
			int						retval = 0;
			
			switch (entry->mirrorDataSynchronizationState)
			{
				case MirroredRelDataSynchronizationState_AppendOnlyCatchup:
				case MirroredRelDataSynchronizationState_FullCopy:
					
					/* 
					 * required in order to report how many blocks were synchronized 
					 * if gp_persistent_relation_node does not return that information 
					 */
					if (entry->mirrorBufpoolResyncChangedPageCount == 0)
					{
						entry->mirrorBufpoolResyncChangedPageCount = (endOffset - startOffset) / BLCKSZ;
					}					
					
					/*
					 * The MirroredAppendOnly_OpenResynchonize routine knows we are a resynch worker and
					 * will open BOTH, but write only the MIRROR!!!
					 */
					MirroredAppendOnly_OpenResynchonize(
											&mirroredOpen, 
											&entry->relFileNode,
											entry->segmentFileNum,
											startOffset,
											&primaryError,
											&mirrorDataLossOccurred);
					if (primaryError != 0)
					{
						ereport(ERROR,
								(errcode_for_file_access(),
								 errmsg("could not open file %u/%u/%u.%u : %s",
										entry->relFileNode.spcNode,
										entry->relFileNode.dbNode,
										entry->relFileNode.relNode,
										entry->segmentFileNum,
										strerror(primaryError))));
						
						break;
					}

					if (mirrorDataLossOccurred)
						break;
					
					/* AO and CO Data Store writes 64k size by default */
					bufferLen = (Size) Min(2*BLCKSZ, endOffset - startOffset);
					buffer = (char*) palloc(bufferLen);
					if (buffer == NULL)
						ereport(ERROR,
								(errcode(ERRCODE_OUT_OF_MEMORY),
								 (errmsg("not enough memory for resynchronization"))));
					
					MemSet(buffer, 0, bufferLen);
					
					while (startOffset < endOffset)
					{
						retval = MirroredAppendOnly_Read(
												&mirroredOpen,
												buffer,
												bufferLen);
						
						if (retval != bufferLen) 
						{
							ereport(ERROR,
									(errcode_for_file_access(),
									 errmsg("could not read from position:" INT64_FORMAT " in file %u/%u/%u.%u : %m",
											startOffset, 
											entry->relFileNode.spcNode,
											entry->relFileNode.dbNode,
											entry->relFileNode.relNode,
											entry->segmentFileNum)));
							
							break;
						}						
						
						MirroredAppendOnly_Append(
											  &mirroredOpen,
											  buffer,
											  bufferLen,
											  &primaryError,
											  &mirrorDataLossOccurred);
						
						if (mirrorDataLossOccurred)
							break;

						Assert(primaryError == 0);	// No primary writes as resync worker.
						
						startOffset += bufferLen;
						/* AO and CO Data Store writes 64k size by default */
						bufferLen = (Size) Min(2*BLCKSZ, endOffset - startOffset);						
					}
					
					if (buffer) 
					{
						pfree(buffer);
						buffer = NULL;
					}
					
					if (mirrorDataLossOccurred)
						break;
					
					/* Flush written data on Mirror */
					MirroredAppendOnly_Flush(
										&mirroredOpen,
										&primaryError,
										&mirrorDataLossOccurred);
					if (mirrorDataLossOccurred)
						break;
					
					Assert(primaryError == 0);	// Not flushed on primary as resync worker.
					
					/* Close Primary and Mirror */
					MirroredAppendOnly_Close(
										&mirroredOpen,
										&mirrorDataLossOccurred);
								
					break;
					
				case MirroredRelDataSynchronizationState_None:										
				case MirroredRelDataSynchronizationState_DataSynchronized:
					break;					
					
				default:
					ereport(LOG, 
							(errmsg("could not resynchronize relation '%u/%u/%u' "
									"mirror synchronization state:'%s(%d)' ",
									entry->relFileNode.spcNode,
									entry->relFileNode.dbNode,
									entry->relFileNode.relNode,
									MirroredRelDataSynchronizationState_Name(entry->mirrorDataSynchronizationState),
									entry->mirrorDataSynchronizationState)));
					break;
			}
			
			break;
		}	//case
		default:
			Assert(0);
			break;
	} //switch
	
	if (mirrorDataLossOccurred)
		status = STATUS_ERROR;
	
	return status;
}
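
The AppendOnly branch above is a chunked copy from startOffset to endOffset, reading and appending at most 2*BLCKSZ bytes per pass. A minimal standalone sketch of that loop shape, assuming the default 8kB BLCKSZ and hypothetical read/append callbacks in place of MirroredAppendOnly_Read/_Append:

#include <stdint.h>

#define COPY_CHUNK (2 * 8192)	/* mirrors Min(2*BLCKSZ, remaining) */

static int
chunked_copy(int64_t start, int64_t end,
			 int (*read_fn) (char *buf, int len),
			 int (*append_fn) (const char *buf, int len))
{
	char		buf[COPY_CHUNK];

	while (start < end)
	{
		int			len = (int) (end - start < COPY_CHUNK ?
								 end - start : COPY_CHUNK);

		if (read_fn(buf, len) != len)
			return -1;			/* short read: caller reports the error */
		if (append_fn(buf, len) != 0)
			return -1;			/* append or mirror failure */
		start += len;
	}
	return 0;
}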
Example #11
0
/*
 * Read tuples in correct sort order from tuplesort, and load them into
 * btree leaves.
 */
static void
_bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
{
	BTPageState *state = NULL;
	bool		merge = (btspool2 != NULL);
	IndexTuple	itup,
				itup2 = NULL;
	bool		should_free,
				should_free2,
				load1;
	TupleDesc	tupdes = RelationGetDescr(wstate->index);
	int			i,
				keysz = RelationGetNumberOfAttributes(wstate->index);
	ScanKey		indexScanKey = NULL;

	if (merge)
	{
		/*
		 * Another BTSpool for dead tuples exists. Now we have to merge
		 * btspool and btspool2.
		 */

		/* Prepare for the merge. */
		itup = tuplesort_getindextuple(btspool->sortstate,
									   true, &should_free);
		itup2 = tuplesort_getindextuple(btspool2->sortstate,
										true, &should_free2);
		indexScanKey = _bt_mkscankey_nodata(wstate->index);

		for (;;)
		{
			load1 = true;		/* load BTSpool next? */
			if (itup2 == NULL)
			{
				if (itup == NULL)
					break;
			}
			else if (itup != NULL)
			{
				for (i = 1; i <= keysz; i++)
				{
					ScanKey		entry;
					Datum		attrDatum1,
								attrDatum2;
					bool		isNull1,
								isNull2;
					int32		compare;

					entry = indexScanKey + i - 1;
					attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
					attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
					if (isNull1)
					{
						if (isNull2)
							compare = 0;		/* NULL "=" NULL */
						else if (entry->sk_flags & SK_BT_NULLS_FIRST)
							compare = -1;		/* NULL "<" NOT_NULL */
						else
							compare = 1;		/* NULL ">" NOT_NULL */
					}
					else if (isNull2)
					{
						if (entry->sk_flags & SK_BT_NULLS_FIRST)
							compare = 1;		/* NOT_NULL ">" NULL */
						else
							compare = -1;		/* NOT_NULL "<" NULL */
					}
					else
					{
						compare =
							DatumGetInt32(FunctionCall2Coll(&entry->sk_func,
														 entry->sk_collation,
															attrDatum1,
															attrDatum2));

						if (entry->sk_flags & SK_BT_DESC)
							compare = -compare;
					}
					if (compare > 0)
					{
						load1 = false;
						break;
					}
					else if (compare < 0)
						break;
				}
			}
			else
				load1 = false;

			/* When we see first tuple, create first index page */
			if (state == NULL)
				state = _bt_pagestate(wstate, 0);

			if (load1)
			{
				_bt_buildadd(wstate, state, itup);
				if (should_free)
					pfree(itup);
				itup = tuplesort_getindextuple(btspool->sortstate,
											   true, &should_free);
			}
			else
			{
				_bt_buildadd(wstate, state, itup2);
				if (should_free2)
					pfree(itup2);
				itup2 = tuplesort_getindextuple(btspool2->sortstate,
												true, &should_free2);
			}
		}
		_bt_freeskey(indexScanKey);
	}
	else
	{
		/* merge is unnecessary */
		while ((itup = tuplesort_getindextuple(btspool->sortstate,
											   true, &should_free)) != NULL)
		{
			/* When we see first tuple, create first index page */
			if (state == NULL)
				state = _bt_pagestate(wstate, 0);

			_bt_buildadd(wstate, state, itup);
			if (should_free)
				pfree(itup);
		}
	}

	/* Close down final pages and write the metapage */
	_bt_uppershutdown(wstate, state);

	/*
	 * If the index is WAL-logged, we must fsync it down to disk before it's
	 * safe to commit the transaction.  (For a non-WAL-logged index we don't
	 * care since the index will be uninteresting after a crash anyway.)
	 *
	 * It's obvious that we must do this when not WAL-logging the build. It's
	 * less obvious that we have to do it even if we did WAL-log the index
	 * pages.  The reason is that since we're building outside shared buffers,
	 * a CHECKPOINT occurring during the build has no way to flush the
	 * previously written data to disk (indeed it won't know the index even
	 * exists).  A crash later on would replay WAL from the checkpoint,
	 * therefore it wouldn't replay our earlier WAL entries. If we do not
	 * fsync those pages here, they might still not be on disk when the crash
	 * occurs.
	 */
	if (RelationNeedsWAL(wstate->index))
	{
		RelationOpenSmgr(wstate->index);
		smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
	}
}
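
The inline NULL handling in the comparison loop above implements btree's NULL-ordering rule: NULLs compare equal to each other and sort first or last depending on SK_BT_NULLS_FIRST. The same rule as a hypothetical standalone helper (plain int in place of PostgreSQL's int32; the function name is illustrative):

#include <stdbool.h>

static int
null_aware_compare(bool isNull1, bool isNull2, bool nulls_first,
				   int datum_compare)
{
	if (isNull1 && isNull2)
		return 0;				/* NULL "=" NULL */
	if (isNull1)
		return nulls_first ? -1 : 1;	/* NULL vs NOT_NULL */
	if (isNull2)
		return nulls_first ? 1 : -1;	/* NOT_NULL vs NULL */
	return datum_compare;		/* both non-NULL: use datum comparison */
}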
void PersistentStore_UpdateTuple(
	PersistentStoreData 		*storeData,

	PersistentStoreSharedData 	*storeSharedData,

	ItemPointer 			persistentTid,
				/* TID of the stored tuple. */

	Datum					*values,

	bool					flushToXLog)
				/* When true, the XLOG record for this change will be flushed to disk. */

{
	Relation	persistentRel;
	bool 		*nulls;
	HeapTuple	persistentTuple = NULL;
	XLogRecPtr 	xlogUpdateEndLoc;
	
#ifdef USE_ASSERT_CHECKING
	if (storeSharedData == NULL ||
		!PersistentStoreSharedData_EyecatcherIsValid(storeSharedData))
		elog(ERROR, "Persistent store shared-memory not valid");
#endif
	
	if (Debug_persistent_store_print)
		elog(PersistentStore_DebugPrintLevel(), 
			 "PersistentStore_UpdateTuple: Going to update whole tuple at TID %s ('%s', shared data %p)",
			 ItemPointerToString(persistentTid),
			 storeData->tableName,
			 storeSharedData);

	persistentRel = (*storeData->openRel)();

	/*
	 * In order to keep the tuples the exact same size to enable direct reuse of
	 * free tuples, we do not use NULLs.
	 */
	nulls = (bool*)palloc0(storeData->numAttributes * sizeof(bool));
		
	/*
	 * Form the tuple.
	 */
	persistentTuple = heap_form_tuple(persistentRel->rd_att, values, nulls);
	if (!HeapTupleIsValid(persistentTuple))
		elog(ERROR, "Failed to build persistent tuple ('%s')",
		     storeData->tableName);

	persistentTuple->t_self = *persistentTid;

	frozen_heap_inplace_update(persistentRel, persistentTuple);

	/*
	 * Return the XLOG location of the UPDATE tuple's XLOG record.
	 */
	xlogUpdateEndLoc = XLogLastInsertEndLoc();

	heap_freetuple(persistentTuple);

#ifdef FAULT_INJECTOR
	if (FaultInjector_InjectFaultIfSet(SyncPersistentTable,
										DDLNotSpecified,
										"" /* databaseName */,
										"" /* tableName */)== FaultInjectorTypeSkip)
	{
		FlushRelationBuffers(persistentRel);
		smgrimmedsync(persistentRel->rd_smgr);
	}
#endif

	(*storeData->closeRel)(persistentRel);
	
	if (Debug_persistent_store_print)
	{
		elog(PersistentStore_DebugPrintLevel(), 
			 "PersistentStore_UpdateTuple: Updated whole tuple at TID %s ('%s')",
			 ItemPointerToString(persistentTid),
			 storeData->tableName);

		(*storeData->printTupleCallback)(
									PersistentStore_DebugPrintLevel(),
									"STORE UPDATED TUPLE",
									persistentTid,
									values);
	}

	if (flushToXLog)
	{
		XLogFlush(xlogUpdateEndLoc);
		XLogRecPtr_Zero(&nowaitXLogEndLoc);
	}
	else
		nowaitXLogEndLoc = xlogUpdateEndLoc;
}
static void PersistentStore_InsertTuple(
	PersistentStoreData 		*storeData,

	PersistentStoreSharedData 	*storeSharedData,

	Datum					*values,

	bool					flushToXLog,
				/* When true, the XLOG record for this change will be flushed to disk. */

	ItemPointer 			persistentTid)
				/* TID of the stored tuple. */

{
	Relation	persistentRel;

#ifdef USE_ASSERT_CHECKING
	if (storeSharedData == NULL ||
		!PersistentStoreSharedData_EyecatcherIsValid(storeSharedData))
		elog(ERROR, "Persistent store shared-memory not valid");
#endif

	if (Debug_persistent_store_print)
		elog(PersistentStore_DebugPrintLevel(), 
			 "PersistentStore_InsertTuple: Going to insert new tuple ('%s', shared data %p)",
			 storeData->tableName,
			 storeSharedData);

	persistentRel = (*storeData->openRel)();

	PersistentStore_DoInsertTuple(
								storeData,
								storeSharedData,
								persistentRel,
								values,
								flushToXLog,
								persistentTid);

#ifdef FAULT_INJECTOR
    if (FaultInjector_InjectFaultIfSet(SyncPersistentTable,
                                        DDLNotSpecified,
                                        "" /* databaseName */,
                                        "" /* tableName */)== FaultInjectorTypeSkip)
    {
        FlushRelationBuffers(persistentRel);
        smgrimmedsync(persistentRel->rd_smgr);
    }
#endif

	(*storeData->closeRel)(persistentRel);
	
	if (Debug_persistent_store_print)
	{
		elog(PersistentStore_DebugPrintLevel(), 
			 "PersistentStore_InsertTuple: Inserted new tuple at TID %s ('%s')",
			 ItemPointerToString(persistentTid),
			 storeData->tableName);
		
		(*storeData->printTupleCallback)(
									PersistentStore_DebugPrintLevel(),
									"STORE INSERT TUPLE",
									persistentTid,
									values);
	}

}
static int64
PersistentBuild_TruncateAllGpRelationNode(void)
{
	Relation pg_database;
	HeapScanDesc scan;
	HeapTuple tuple;

	int64 count;

	pg_database = heap_open(
						DatabaseRelationId,
						AccessShareLock);

	/*
	 * Truncate gp_relation_node and its index in each database.
	 */
	scan = heap_beginscan(pg_database, SnapshotNow, 0, NULL);
	count = 0;
	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		Form_pg_database form_pg_database =
						(Form_pg_database)GETSTRUCT(tuple);

		Oid dbOid;
		Oid dattablespace;
		RelFileNode relFileNode;
		SMgrRelation smgrRelation;
		Page btree_metapage;
		
		dbOid = HeapTupleGetOid(tuple);
		dattablespace = form_pg_database->dattablespace;

		if (dbOid == HcatalogDbOid)
			continue;

		if (Debug_persistent_print)
			elog(Persistent_DebugPrintLevel(), 
				 "PersistentBuild_TruncateAllGpRelationNode: dbOid %u, '%s'",
				 dbOid,
				 form_pg_database->datname.data);

		relFileNode.spcNode = dattablespace;
		relFileNode.dbNode = dbOid;
		relFileNode.relNode = GpRelfileNodeRelationId;

		/* relFileNode must be filled in before it can be logged. */
		if (Debug_persistent_print)
			elog(Persistent_DebugPrintLevel(), 
				 "Truncating gp_relation_node %u/%u/%u in database oid %u ('%s')",
				 relFileNode.spcNode,
				 relFileNode.dbNode,
				 relFileNode.relNode,
				 dbOid,
				 form_pg_database->datname.data);

		/*
		 * Truncate WITHOUT generating an XLOG record (i.e. pretend it is a temp relation).
		 */
		PersistentBuild_NonTransactionTruncate(&relFileNode);
		count++;

		/*
		 * And, the index.  Unfortunately, the relfilenode OID can change due to a
		 * REINDEX {TABLE|INDEX} command.
		 */
		PersistentBuild_FindGpRelationNodeIndex(
											dbOid,
											dattablespace,
											&relFileNode);

		if (Debug_persistent_print)
			elog(Persistent_DebugPrintLevel(), 
				 "Truncating gp_relation_node_index %u/%u/%u in database oid %u ('%s').  relfilenode different %s, tablespace different %s",
				 relFileNode.spcNode,
				 relFileNode.dbNode,
				 relFileNode.relNode,
				 dbOid,
				 form_pg_database->datname.data,
				 ((relFileNode.relNode != GpRelfileNodeOidIndexId) ? "true" : "false"),
				 ((relFileNode.spcNode != dattablespace) ? "true" : "false"));

		PersistentBuild_NonTransactionTruncate(&relFileNode);

		// The BTree needs an empty meta-data block.
		smgrRelation = smgropen(relFileNode);

		btree_metapage = (Page)palloc(BLCKSZ);
		_bt_initmetapage(btree_metapage, P_NONE, 0);
		smgrwrite(
			smgrRelation, 
			/* blockNum */ 0, 
			(char*)btree_metapage,
			/* isTemp */ false);
		smgrimmedsync(smgrRelation);
		pfree(btree_metapage);

		smgrclose(smgrRelation);

		count++;
	}

	heap_endscan(scan);

	heap_close(pg_database, AccessShareLock);

	return count;
}