Example #1
/* ----------------------------------------------------------------
 *		ExecEndBitmapIndexScan
 * ----------------------------------------------------------------
 */
void
ExecEndBitmapIndexScan(BitmapIndexScanState *node)
{
	Relation	indexRelationDesc;
	IndexScanDesc indexScanDesc;
	IndexScanDesc odIndexScanDesc;

	/*
	 * extract information from the node
	 */
	indexRelationDesc = node->biss_RelationDesc;
	indexScanDesc = node->biss_ScanDesc;
	odIndexScanDesc = node->odbiss_ScanDesc;

	/*
	 * Free the exprcontext ... now dead code, see ExecFreeExprContext
	 */
#ifdef NOT_USED
	if (node->biss_RuntimeContext)
		FreeExprContext(node->biss_RuntimeContext);
#endif

	/*
	 * close the index relation
	 */
	index_endscan(indexScanDesc);
	if (odIndexScanDesc != NULL)
	{
		index_endscan(odIndexScanDesc);
		odIndexScanDesc = NULL;
	}
	index_close(indexRelationDesc);
}
/*
 * Release resources for one part (this includes closing the index and
 * the relation).
 */
static inline void
CleanupOnePartition(IndexScanState *indexState)
{
	Assert(NULL != indexState);

	/* Reset index state and release locks. */
	ExecClearTuple(indexState->ss.ps.ps_ResultTupleSlot);
	ExecClearTuple(indexState->ss.ss_ScanTupleSlot);

	if ((indexState->ss.scan_state & SCAN_SCAN) != 0)
	{
		Assert(indexState->iss_ScanDesc != NULL);
		Assert(indexState->iss_RelationDesc != NULL);
		Assert(indexState->ss.ss_currentRelation != NULL);

		index_endscan(indexState->iss_ScanDesc);
		indexState->iss_ScanDesc = NULL;

		index_close(indexState->iss_RelationDesc, NoLock);
		indexState->iss_RelationDesc = NULL;

		ExecCloseScanRelation(indexState->ss.ss_currentRelation);
		indexState->ss.ss_currentRelation = NULL;
	}

	indexState->ss.scan_state = SCAN_INIT;
}
Example #3
/*
 * systable_endscan_ordered --- close scan, release resources
 */
void
systable_endscan_ordered(SysScanDesc sysscan)
{
	Assert(sysscan->irel);
	index_endscan(sysscan->iscan);
	pfree(sysscan);
}
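For context, a minimal sketch of the caller side of this API: open an ordered catalog scan, consume it, and let systable_endscan_ordered release the underlying index scan. The relation arguments, the snapshot, and the single OID scan key below are illustrative assumptions, not taken from any example in this listing.

/*
 * Hypothetical caller (sketch): iterate a catalog in index order, then
 * clean up.  The key column and comparison operator are assumptions.
 */
static void
scan_catalog_in_index_order(Relation heapRel, Relation indexRel,
							Snapshot snapshot, Oid keyOid)
{
	SysScanDesc sscan;
	ScanKeyData skey;
	HeapTuple	tup;

	ScanKeyInit(&skey,
				(AttrNumber) 1,		/* assumed: OID is the first index column */
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(keyOid));

	sscan = systable_beginscan_ordered(heapRel, indexRel, snapshot, 1, &skey);

	while ((tup = systable_getnext_ordered(sscan, ForwardScanDirection)) != NULL)
	{
		/* ... process tup ... */
	}

	/* pairs with systable_beginscan_ordered; calls index_endscan() internally */
	systable_endscan_ordered(sscan);
}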
Example #4
/*
 * _bitmap_cleanup_buildstate() -- clean up the build state after
 *	inserting all rows in the heap into the bitmap index.
 */
void
_bitmap_cleanup_buildstate(Relation index, BMBuildState *bmstate)
{
	/* write out remaining tids in bmstate->bm_tidLocsBuffer */
	BMTidBuildBuf	*tidLocsBuffer = bmstate->bm_tidLocsBuffer;
	_bitmap_write_alltids(index, tidLocsBuffer, bmstate->use_wal);

	pfree(bmstate->bm_tidLocsBuffer);

	if (cur_bmbuild)
	{
		MemoryContextDelete(cur_bmbuild->tmpcxt);
		MemoryContextDelete(cur_bmbuild->hash_cxt);
		pfree(cur_bmbuild->hash_funcs);
		pfree(cur_bmbuild->eq_funcs);
		pfree(cur_bmbuild);
		cur_bmbuild = NULL;
	}
	else
	{
		/* 
	 * We might have built an index on a non-hashable data type, in
		 * which case we will have searched the btree manually. Free associated
		 * memory.
		 */
		index_endscan(bmstate->bm_lov_scanDesc);
		pfree(bmstate->bm_lov_scanKeys);
	}

	_bitmap_close_lov_heapandindex(bmstate->bm_lov_heap,bmstate->bm_lov_index,
						 		   RowExclusiveLock);
}
Example #5
/*
 * GetNewOidWithIndex
 *		Guts of GetNewOid: use the supplied index
 *
 * This is exported separately because there are cases where we want to use
 * an index that will not be recognized by RelationGetOidIndex: TOAST tables
 * and pg_largeobject have indexes that are usable, but have multiple columns
 * and are on ordinary columns rather than a true OID column.  This code
 * will work anyway, so long as the OID is the index's first column.
 *
 * Caller must have a suitable lock on the relation.
 */
Oid
GetNewOidWithIndex(Relation relation, Relation indexrel)
{
	Oid			newOid;
	SnapshotData SnapshotDirty;
	IndexScanDesc scan;
	ScanKeyData key;
	bool		collides;

	InitDirtySnapshot(SnapshotDirty);

	/* Generate new OIDs until we find one not in the table */
	do
	{
		CHECK_FOR_INTERRUPTS();

		newOid = GetNewObjectId();

		ScanKeyInit(&key,
					(AttrNumber) 1,
					BTEqualStrategyNumber, F_OIDEQ,
					ObjectIdGetDatum(newOid));

		/* see notes above about using SnapshotDirty */
		scan = index_beginscan(relation, indexrel,
							   &SnapshotDirty, 1, &key);

		collides = HeapTupleIsValid(index_getnext(scan, ForwardScanDirection));

		index_endscan(scan);
	} while (collides);

	return newOid;
}
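The probe above is the begin/fetch/end lifecycle that every example in this listing repeats: one index_beginscan() paired with exactly one index_endscan(), with the index relation opened and closed separately. Below is a minimal, self-contained sketch of that pattern, written against the same five-argument index_beginscan() signature used in these examples; the relations, snapshot, and OID scan key are placeholders.

/*
 * Sketch: report whether any tuple matching the given OID key exists,
 * probing an index whose first column is assumed to be an OID.
 */
static bool
index_key_exists(Relation heapRel, Relation indexRel,
				 Snapshot snapshot, Oid keyOid)
{
	IndexScanDesc scan;
	ScanKeyData key;
	bool		found;

	ScanKeyInit(&key,
				(AttrNumber) 1,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(keyOid));

	scan = index_beginscan(heapRel, indexRel, snapshot, 1, &key);
	found = HeapTupleIsValid(index_getnext(scan, ForwardScanDirection));

	/* every index_beginscan() must be matched by an index_endscan() */
	index_endscan(scan);

	return found;
}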
Example #6
/* ----------------------------------------------------------------
 *		ExecEndBitmapIndexScan
 * ----------------------------------------------------------------
 */
void
ExecEndBitmapIndexScan(BitmapIndexScanState *node)
{
	Relation	indexRelationDesc;
	IndexScanDesc indexScanDesc;

	/*
	 * extract information from the node
	 */
	indexRelationDesc = node->biss_RelationDesc;
	indexScanDesc = node->biss_ScanDesc;

	/*
	 * Free the exprcontext ... now dead code, see ExecFreeExprContext
	 */
#ifdef NOT_USED
	if (node->biss_RuntimeContext)
		FreeExprContext(node->biss_RuntimeContext, true);
#endif

	/*
	 * close the index relation (no-op if we didn't open it)
	 */
	if (indexScanDesc)
		index_endscan(indexScanDesc);
	if (indexRelationDesc)
		index_close(indexRelationDesc, NoLock);
}
Example #7
/*
 * Determine size of a large object
 *
 * NOTE: LOs can contain gaps, just like Unix files.  We actually return
 * the offset of the last byte + 1.
 */
static uint32
inv_getsize(LargeObjectDesc *obj_desc)
{
	bool		found = false;
	uint32		lastbyte = 0;
	ScanKeyData skey[1];
	IndexScanDesc sd;
	HeapTuple	tuple;

	Assert(PointerIsValid(obj_desc));

	open_lo_relation();

	ScanKeyInit(&skey[0],
				Anum_pg_largeobject_loid,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(obj_desc->id));

	sd = index_beginscan(lo_heap_r, lo_index_r,
						 obj_desc->snapshot, 1, skey);

	/*
	 * Because the pg_largeobject index is on both loid and pageno, but we
	 * constrain only loid, a backwards scan should visit all pages of the
	 * large object in reverse pageno order.  So, it's sufficient to examine
	 * the first valid tuple (== last valid page).
	 */
	while ((tuple = index_getnext(sd, BackwardScanDirection)) != NULL)
	{
		Form_pg_largeobject data;
		bytea	   *datafield;
		bool		pfreeit;

		found = true;
		if (HeapTupleHasNulls(tuple))	/* paranoia */
			elog(ERROR, "null field found in pg_largeobject");
		data = (Form_pg_largeobject) GETSTRUCT(tuple);
		datafield = &(data->data);		/* see note at top of file */
		pfreeit = false;
		if (VARATT_IS_EXTENDED(datafield))
		{
			datafield = (bytea *)
				heap_tuple_untoast_attr((struct varlena *) datafield);
			pfreeit = true;
		}
		lastbyte = data->pageno * LOBLKSIZE + getbytealen(datafield);
		if (pfreeit)
			pfree(datafield);
		break;
	}

	index_endscan(sd);

	if (!found)
		ereport(ERROR,
				(errcode(ERRCODE_UNDEFINED_OBJECT),
				 errmsg("large object %u does not exist", obj_desc->id)));
	return lastbyte;
}
Example #8
/*
 * Free the index scan descriptor.
 */
static inline void
freeScanDesc(IndexScanState *indexstate)
{
    if (indexstate->iss_ScanDesc != NULL)
    {
        index_endscan(indexstate->iss_ScanDesc);
        indexstate->iss_ScanDesc = NULL;
    }
}
Example #9
/*
 * systable_endscan_ordered --- close scan, release resources
 */
void
systable_endscan_ordered(SysScanDesc sysscan)
{
	Assert(sysscan->irel);
	index_endscan(sysscan->iscan);
	if (sysscan->snapshot)
		UnregisterSnapshot(sysscan->snapshot);
	pfree(sysscan);
}
Example #10
/*
 * Ends an index scan over the visimap store.
 */ 
void
AppendOnlyVisimapStore_EndScan(
	AppendOnlyVisimapStore *visiMapStore,
	IndexScanDesc indexScan)
{
	Assert(visiMapStore);
	Assert(indexScan);

	index_endscan(indexScan);
}
Example #11
/*
 * systable_endscan --- close scan, release resources
 *
 * Note that it's still up to the caller to close the heap relation.
 */
void
systable_endscan(SysScanDesc sysscan)
{
	if (sysscan->irel)
	{
		index_endscan(sysscan->iscan);
		index_close(sysscan->irel, AccessShareLock);
	}
	else
		heap_endscan(sysscan->scan);

	pfree(sysscan);
}
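As a counterpart to the cleanup above, here is a sketch of how such a scan is typically opened and consumed; whether systable_endscan ends an index scan or a heap scan is decided at systable_beginscan() time by its indexOK argument and by whether system indexes are usable. The catalog relation, index OID, snapshot, and key column below are assumptions for illustration only.

/*
 * Hypothetical lookup (sketch): fetch a copy of one catalog row by OID.
 */
static HeapTuple
lookup_catalog_row_copy(Relation catalogRel, Oid indexId,
						Snapshot snapshot, Oid targetOid)
{
	SysScanDesc sscan;
	ScanKeyData skey;
	HeapTuple	tup;
	HeapTuple	result = NULL;

	ScanKeyInit(&skey,
				(AttrNumber) 1,		/* assumed: OID is the first key column */
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(targetOid));

	/* indexOK = true: use the index if allowed, otherwise fall back to a heap scan */
	sscan = systable_beginscan(catalogRel, indexId, true, snapshot, 1, &skey);

	tup = systable_getnext(sscan);
	if (HeapTupleIsValid(tup))
		result = heap_copytuple(tup);

	/* releases either the index scan or the heap scan, as shown above */
	systable_endscan(sscan);

	return result;
}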
Example #12
/* ----------
 * toast_delete_datum -
 *
 *	Delete a single external stored value.
 * ----------
 */
static void
toast_delete_datum(Relation rel __attribute__((unused)), Datum value)
{
	varattrib  *attr = (varattrib *) DatumGetPointer(value);
	Relation	toastrel;
	Relation	toastidx;
	ScanKeyData toastkey;
	IndexScanDesc toastscan;
	HeapTuple	toasttup;

	if (!VARATT_IS_EXTERNAL(attr))
		return;

	/*
	 * Open the toast relation and its index
	 */
	toastrel = heap_open(attr->va_external.va_toastrelid,
						 RowExclusiveLock);
	toastidx = index_open(toastrel->rd_rel->reltoastidxid, RowExclusiveLock);

	/*
	 * Setup a scan key to fetch from the index by va_valueid (we don't
	 * particularly care whether we see them in sequence or not)
	 */
	ScanKeyInit(&toastkey,
				(AttrNumber) 1,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(attr->va_external.va_valueid));

	/*
	 * Find all the chunks by index.
	 */
	toastscan = index_beginscan(toastrel, toastidx,
								SnapshotToast, 1, &toastkey);
	while ((toasttup = index_getnext(toastscan, ForwardScanDirection)) != NULL)
	{
		/*
		 * Have a chunk, delete it
		 */
		simple_heap_delete(toastrel, &toasttup->t_self);
	}

	/*
	 * End scan and close relations
	 */
	index_endscan(toastscan);
	index_close(toastidx, RowExclusiveLock);
	heap_close(toastrel, RowExclusiveLock);
}
Example #13
/* ----------
 * toast_delete_datum -
 *
 *	Delete a single external stored value.
 * ----------
 */
static void
toast_delete_datum(Relation rel, Datum value)
{
	varattrib  *attr = (varattrib *) DatumGetPointer(value);
	Relation	toastrel;
	Relation	toastidx;
	ScanKeyData toastkey;
	IndexScanDesc toastscan;
	HeapTuple	toasttup;

	if (!VARATT_IS_EXTERNAL(attr))
		return;

	/*
	 * Open the toast relation and its index
	 */
	toastrel = heap_open(attr->va_content.va_external.va_toastrelid,
						 RowExclusiveLock);
	toastidx = index_open(toastrel->rd_rel->reltoastidxid);

	/*
	 * Setup a scan key to fetch from the index by va_valueid (we don't
	 * particularly care whether we see them in sequence or not)
	 */
	ScanKeyEntryInitialize(&toastkey,
						   (bits16) 0,
						   (AttrNumber) 1,
						   (RegProcedure) F_OIDEQ,
			  ObjectIdGetDatum(attr->va_content.va_external.va_valueid));

	/*
	 * Find the chunks by index
	 */
	toastscan = index_beginscan(toastrel, toastidx, SnapshotToast,
								1, &toastkey);
	while ((toasttup = index_getnext(toastscan, ForwardScanDirection)) != NULL)
	{
		/*
		 * Have a chunk, delete it
		 */
		simple_heap_delete(toastrel, &toasttup->t_self);
	}

	/*
	 * End scan and close relations
	 */
	index_endscan(toastscan);
	index_close(toastidx);
	heap_close(toastrel, RowExclusiveLock);
}
Example #14
/* ----------------------------------------------------------------
 *		ExecEndIndexOnlyScan
 * ----------------------------------------------------------------
 */
void
ExecEndIndexOnlyScan(IndexOnlyScanState *node)
{
	Relation	indexRelationDesc;
	IndexScanDesc indexScanDesc;
	Relation	relation;

	/*
	 * extract information from the node
	 */
	indexRelationDesc = node->ioss_RelationDesc;
	indexScanDesc = node->ioss_ScanDesc;
	relation = node->ss.ss_currentRelation;

	/* Release VM buffer pin, if any. */
	if (node->ioss_VMBuffer != InvalidBuffer)
	{
		ReleaseBuffer(node->ioss_VMBuffer);
		node->ioss_VMBuffer = InvalidBuffer;
	}

	/*
	 * Free the exprcontext(s) ... now dead code, see ExecFreeExprContext
	 */
#ifdef NOT_USED
	ExecFreeExprContext(&node->ss.ps);
	if (node->ioss_RuntimeContext)
		FreeExprContext(node->ioss_RuntimeContext, true);
#endif

	/*
	 * clear out tuple table slots
	 */
	ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
	ExecClearTuple(node->ss.ss_ScanTupleSlot);

	/*
	 * close the index relation (no-op if we didn't open it)
	 */
	if (indexScanDesc)
		index_endscan(indexScanDesc);
	if (indexRelationDesc)
		index_close(indexRelationDesc, NoLock);

	/*
	 * close the heap relation.
	 */
	ExecCloseScanRelation(relation);
}
Example #15
/*
 * GetNewOidWithIndex
 *		Guts of GetNewOid: use the supplied index
 *
 * This is exported separately because there are cases where we want to use
 * an index that will not be recognized by RelationGetOidIndex: TOAST tables
 * and pg_largeobject have indexes that are usable, but have multiple columns
 * and are on ordinary columns rather than a true OID column.  This code
 * will work anyway, so long as the OID is the index's first column.
 *
 * Caller must have a suitable lock on the relation.
 */
Oid
GetNewOidWithIndex(Relation relation, Relation indexrel)
{
	Oid			newOid;
	IndexScanDesc scan;
	ScanKeyData key;
	bool		collides;

	/* Generate new OIDs until we find one not in the table */
	do
	{
		CHECK_FOR_INTERRUPTS();

		newOid = GetNewObjectId();

		ScanKeyInit(&key,
					(AttrNumber) 1,
					BTEqualStrategyNumber, F_OIDEQ,
					ObjectIdGetDatum(newOid));

		/* see notes above about using SnapshotDirty */
		scan = index_beginscan(relation, indexrel,
							   SnapshotDirty, 1, &key);

		collides = HeapTupleIsValid(index_getnext(scan, ForwardScanDirection));

		index_endscan(scan);
	} while (collides);
	
	if (IsSystemNamespace(RelationGetNamespace(relation)))
	{
		if (Gp_role == GP_ROLE_EXECUTE)
		{
			if (relation->rd_id != 2604 /* pg_attrdef */ && relation->rd_id != 2606 /* pg_constraint */ && relation->rd_id != 2615 /* pg_namespace */) 
				elog(DEBUG1,"Allocating Oid %u with index on relid %u %s in EXECUTE mode",newOid,relation->rd_id, RelationGetRelationName(relation));
			else
				elog(DEBUG4,"Allocating Oid %u with index on relid %u %s in EXECUTE mode",newOid,relation->rd_id, RelationGetRelationName(relation));
		}
		if (Gp_role == GP_ROLE_DISPATCH)
		{
			elog(DEBUG5,"Allocating Oid %u with index on relid %u %s in DISPATCH mode",newOid,relation->rd_id,  RelationGetRelationName(relation));
		}
	}

	return newOid;
}
Example #16
/* ----------------------------------------------------------------
 *		ExecEndIndexScan
 * ----------------------------------------------------------------
 */
void
ExecEndIndexScan(index_ss *node)
{
	struct relation* indexRelationDesc;
	struct index_scan* indexScanDesc;
	struct relation* relation;

	/*
	 * extract information from the node
	 */
	indexRelationDesc = node->iss_RelationDesc;
	indexScanDesc = node->iss_ScanDesc;
	relation = node->ss.ss_currentRelation;

	/*
	 * Free the exprcontext(s) ... now dead code, see exec_free_expr_ctx
	 */
#ifdef NOT_USED
	exec_free_expr_ctx(&node->ss.ps);
	if (node->iss_RuntimeContext)
		free_expr_ctx(node->iss_RuntimeContext, true);
#endif

	/*
	 * clear out tuple table slots
	 */
	exec_clear_tuple(node->ss.ps.ps_ResultTupleSlot);
	exec_clear_tuple(node->ss.ss_ScanTupleSlot);

	/*
	 * close the index relation (no-op if we didn't open it)
	 */
	if (indexScanDesc)
		index_endscan(indexScanDesc);

	if (indexRelationDesc)
		index_close(indexRelationDesc, NO_LOCK);

	/*
	 * close the heap relation.
	 */
	ExecCloseScanRelation(relation);
}
Example #17
/*
 * AppendOnlyBlockDirectory_DeleteSegmentFile
 *
 * Deletes all block directory entries for given segment file of an
 * append-only relation.
 */ 
void
AppendOnlyBlockDirectory_DeleteSegmentFile(
		AppendOnlyEntry *aoEntry,
		Snapshot snapshot,
		int segno,
		int columnGroupNo)
{
	Assert(aoEntry);
	Assert(OidIsValid(aoEntry->blkdirrelid));
	Assert(OidIsValid(aoEntry->blkdiridxid));

	Relation blkdirRel = heap_open(aoEntry->blkdirrelid, RowExclusiveLock);
	Relation blkdirIdx = index_open(aoEntry->blkdiridxid, RowExclusiveLock);

	ScanKeyData scanKey;
	ScanKeyInit(&scanKey,
			1, /* segno */
			BTEqualStrategyNumber,
			F_INT4EQ,
			Int32GetDatum(segno));

	IndexScanDesc indexScan = index_beginscan(
			blkdirRel,
			blkdirIdx,
			snapshot,
			1,
			&scanKey);
	
	HeapTuple tuple = NULL;
	while ((tuple = index_getnext(indexScan, ForwardScanDirection)) != NULL)
	{
		simple_heap_delete(blkdirRel,
				&tuple->t_self);
	}
	index_endscan(indexScan);

	index_close(blkdirIdx, RowExclusiveLock);
	heap_close(blkdirRel, RowExclusiveLock);

}
Example #18
/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
Datum
rtbulkdelete(PG_FUNCTION_ARGS)
{
	Relation	rel = (Relation) PG_GETARG_POINTER(0);
	IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1);
	void	   *callback_state = (void *) PG_GETARG_POINTER(2);
	IndexBulkDeleteResult *result;
	BlockNumber num_pages;
	double		tuples_removed;
	double		num_index_tuples;
	IndexScanDesc iscan;

	tuples_removed = 0;
	num_index_tuples = 0;

	/*
	 * Since rtree is not marked "amconcurrent" in pg_am, caller should have
	 * acquired exclusive lock on index relation.  We need no locking here.
	 */

	/*
	 * XXX generic implementation --- should be improved!
	 */

	/* walk through the entire index */
	iscan = index_beginscan(NULL, rel, SnapshotAny, 0, NULL);
	/* including killed tuples */
	iscan->ignore_killed_tuples = false;

	while (index_getnext_indexitem(iscan, ForwardScanDirection))
	{
		vacuum_delay_point();

		if (callback(&iscan->xs_ctup.t_self, callback_state))
		{
			ItemPointerData indextup = iscan->currentItemData;
			BlockNumber blkno;
			OffsetNumber offnum;
			Buffer		buf;
			Page		page;

			blkno = ItemPointerGetBlockNumber(&indextup);
			offnum = ItemPointerGetOffsetNumber(&indextup);

			/* adjust any scans that will be affected by this deletion */
			/* (namely, my own scan) */
			rtadjscans(rel, RTOP_DEL, blkno, offnum);

			/* delete the index tuple */
			buf = ReadBuffer(rel, blkno);
			page = BufferGetPage(buf);

			PageIndexTupleDelete(page, offnum);

			WriteBuffer(buf);

			tuples_removed += 1;
		}
		else
			num_index_tuples += 1;
	}

	index_endscan(iscan);

	/* return statistics */
	num_pages = RelationGetNumberOfBlocks(rel);

	result = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
	result->num_pages = num_pages;
	result->num_index_tuples = num_index_tuples;
	result->tuples_removed = tuples_removed;

	PG_RETURN_POINTER(result);
}
Example #19
/*
 * AppendOnlyBlockDirectory_GetEntry
 *
 * Find a directory entry for the given AOTupleId in the block directory.
 * If such an entry is found, return true. Otherwise, return false.
 *
 * The range for directoryEntry is assigned accordingly in this function.
 *
 * The block directory for the appendonly table should exist before calling
 * this function.
 */
bool
AppendOnlyBlockDirectory_GetEntry(
	AppendOnlyBlockDirectory		*blockDirectory,
	AOTupleId 						*aoTupleId,
	int                             columnGroupNo,
	AppendOnlyBlockDirectoryEntry	*directoryEntry)
{
	int segmentFileNum = AOTupleIdGet_segmentFileNum(aoTupleId);
	int64 rowNum = AOTupleIdGet_rowNum(aoTupleId);
	int		i;
	Relation blkdirRel = blockDirectory->blkdirRel;
	Relation blkdirIdx = blockDirectory->blkdirIdx;
	int numScanKeys = blockDirectory->numScanKeys;
	ScanKey scanKeys = blockDirectory->scanKeys;
	
	TupleDesc heapTupleDesc;
	FileSegInfo *fsInfo = NULL;
	IndexScanDesc idxScanDesc;
	HeapTuple tuple = NULL;
	MinipagePerColumnGroup *minipageInfo =
		&blockDirectory->minipages[columnGroupNo];
	int entry_no = -1;
	int tmpGroupNo;
	
	if (blkdirRel == NULL || blkdirIdx == NULL)
	{
		Assert(RelationIsValid(blockDirectory->aoRel));
		
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR),
				 errmsg("Block directory for append-only relation '%s' does not exist",
						RelationGetRelationName(blockDirectory->aoRel))));
		return false;
	}

	ereportif(Debug_appendonly_print_blockdirectory, LOG,
				(errmsg("Append-only block directory get entry: "
						"(columnGroupNo, segmentFileNum, rowNum) = "
						"(%d, %d, " INT64_FORMAT ")",
						columnGroupNo, segmentFileNum, rowNum)));

	/*
	 * If the segment file number is the same as
	 * blockDirectory->currentSegmentFileNum, the in-memory minipage
	 * may contain such an entry. We search the in-memory minipage
	 * first. If such an entry cannot be found, we search for the
	 * appropriate minipage by using the block directory btree index.
	 */
	if (segmentFileNum == blockDirectory->currentSegmentFileNum &&
		minipageInfo->numMinipageEntries > 0)
	{
		Assert(blockDirectory->currentSegmentFileInfo != NULL);

		MinipageEntry *firstentry =
			&minipageInfo->minipage->entry[0];
		if (rowNum >= firstentry->firstRowNum)
		{
			/*
			 * Check if the existing minipage contains the requested
			 * rowNum. If so, just get it.
			 */
			entry_no = find_minipage_entry(minipageInfo->minipage,
									   minipageInfo->numMinipageEntries,
									   rowNum);
			if (entry_no != -1)
			{
				return set_directoryentry_range(blockDirectory,
									 columnGroupNo,
									 entry_no,
									 directoryEntry);

			}

			/*
			 * The given rowNum may point to a tuple that does not exist
			 * in the AO table any more, either because of cancellation of
			 * an insert, or due to crashes during an insert. If this is
			 * the case, rowNum is smaller than the highest entry in
			 * the in-memory minipage entry.
			 */
			else
			{
				MinipageEntry *entry =
					&minipageInfo->minipage->entry[minipageInfo->numMinipageEntries - 1];
				
				if (rowNum < entry->firstRowNum + entry->rowCount - 1)
					return false;
			}
		}
	}

	for (i = 0; i < blockDirectory->totalSegfiles; i++)
	{
		fsInfo = blockDirectory->segmentFileInfo[i];
		
		if (!blockDirectory->isAOCol && segmentFileNum == fsInfo->segno)
			break;
		else if (blockDirectory->isAOCol && segmentFileNum ==
				 ((AOCSFileSegInfo*)fsInfo)->segno)
			break;
	}

	Assert(fsInfo != NULL);

	/*
	 * Search the btree index to find the minipage that contains
	 * the rowNum. We find the minipages for all column groups, since
	 * currently we will need to access all columns at the same time.
	 */
	heapTupleDesc = RelationGetDescr(blkdirRel);

	Assert(numScanKeys == 3);

	for (tmpGroupNo = 0; tmpGroupNo < blockDirectory->numColumnGroups; tmpGroupNo++)
	{
		if (blockDirectory->proj && !blockDirectory->proj[tmpGroupNo])
		{
			/* Ignore columns that are not projected. */
			continue;
		}
		/* Setup the scan keys for the scan. */
		Assert(scanKeys != NULL);
		scanKeys[0].sk_argument = Int32GetDatum(segmentFileNum);
		scanKeys[1].sk_argument = Int32GetDatum(tmpGroupNo);
		scanKeys[2].sk_argument = Int64GetDatum(rowNum);
		
		idxScanDesc = index_beginscan(blkdirRel, blkdirIdx,
									  blockDirectory->appendOnlyMetaDataSnapshot,
									  numScanKeys, scanKeys);
	
		tuple = index_getnext(idxScanDesc, BackwardScanDirection);

		if (tuple != NULL)
		{
			/*
			 * MPP-17061: we need to update currentSegmentFileNum
			 * and currentSegmentFileInfo at the same time as we
			 * load the minipage for the block directory entry we
			 * found; otherwise we would risk an inconsistency
			 * between currentSegmentFileNum/currentSegmentFileInfo
			 * and the minipage contents, which would cause a wrong
			 * block header offset to be returned in subsequent
			 * block directory entry lookups.
			 */
			blockDirectory->currentSegmentFileNum = segmentFileNum;
			blockDirectory->currentSegmentFileInfo = fsInfo;
			
			extract_minipage(blockDirectory,
							 tuple,
							 heapTupleDesc,
							 tmpGroupNo);
		}
		else
		{
			/* MPP-17061: index look up failed, row is invisible */
			index_endscan(idxScanDesc);
			return false;
		}	

		index_endscan(idxScanDesc);
	}

	{
		MinipagePerColumnGroup *minipageInfo;
		minipageInfo = &blockDirectory->minipages[columnGroupNo];

		/*
		 * Perform a binary search over the minipage to find
		 * the entry about the AO block.
		 */
		entry_no = find_minipage_entry(minipageInfo->minipage,
									   minipageInfo->numMinipageEntries,
									   rowNum);

		/* If there are no entries, return false. */
		if (entry_no == -1 && minipageInfo->numMinipageEntries == 0)
			return false;

		if (entry_no == -1)
		{
			/*
			 * Since the last few blocks may not be logged in the block
			 * directory, we always use the last entry.
			 */
			entry_no = minipageInfo->numMinipageEntries - 1;
		}
		return set_directoryentry_range(blockDirectory,
										columnGroupNo,
										entry_no,
										directoryEntry);
	}
	
	return false;
}
Example #20
/* ----------
 * toast_fetch_datum_slice -
 *
 *	Reconstruct a segment of a varattrib from the chunks saved
 *	in the toast relation
 * ----------
 */
static varattrib *
toast_fetch_datum_slice(varattrib *attr, int32 sliceoffset, int32 length)
{
	Relation	toastrel;
	Relation	toastidx;
	ScanKeyData toastkey[3];
	int			nscankeys;
	IndexScanDesc toastscan;
	HeapTuple	ttup;
	TupleDesc	toasttupDesc;
	varattrib  *result;
	int32		attrsize;
	int32		residx;
	int32		nextidx;
	int			numchunks;
	int			startchunk;
	int			endchunk;
	int32		startoffset;
	int32		endoffset;
	int			totalchunks;
	Pointer		chunk;
	bool		isnull;
	int32		chunksize;
	int32		chcpystrt;
	int32		chcpyend;

	attrsize = attr->va_content.va_external.va_extsize;
	totalchunks = ((attrsize - 1) / TOAST_MAX_CHUNK_SIZE) + 1;

	if (sliceoffset >= attrsize)
	{
		sliceoffset = 0;
		length = 0;
	}

	if (((sliceoffset + length) > attrsize) || length < 0)
		length = attrsize - sliceoffset;

	result = (varattrib *) palloc(length + VARHDRSZ);
	VARATT_SIZEP(result) = length + VARHDRSZ;

	if (VARATT_IS_COMPRESSED(attr))
		VARATT_SIZEP(result) |= VARATT_FLAG_COMPRESSED;

	if (length == 0)
		return (result);		/* Can save a lot of work at this point! */

	startchunk = sliceoffset / TOAST_MAX_CHUNK_SIZE;
	endchunk = (sliceoffset + length - 1) / TOAST_MAX_CHUNK_SIZE;
	numchunks = (endchunk - startchunk) + 1;

	startoffset = sliceoffset % TOAST_MAX_CHUNK_SIZE;
	endoffset = (sliceoffset + length - 1) % TOAST_MAX_CHUNK_SIZE;

	/*
	 * Open the toast relation and its index
	 */
	toastrel = heap_open(attr->va_content.va_external.va_toastrelid,
						 AccessShareLock);
	toasttupDesc = toastrel->rd_att;
	toastidx = index_open(toastrel->rd_rel->reltoastidxid);

	/*
	 * Setup a scan key to fetch from the index. This is either two keys
	 * or three depending on the number of chunks.
	 */
	ScanKeyEntryInitialize(&toastkey[0],
						   (bits16) 0,
						   (AttrNumber) 1,
						   (RegProcedure) F_OIDEQ,
			  ObjectIdGetDatum(attr->va_content.va_external.va_valueid));

	/*
	 * Now dependent on number of chunks:
	 */

	if (numchunks == 1)
	{
		ScanKeyEntryInitialize(&toastkey[1],
							   (bits16) 0,
							   (AttrNumber) 2,
							   (RegProcedure) F_INT4EQ,
							   Int32GetDatum(startchunk));
		nscankeys = 2;
	}
	else
	{
		ScanKeyEntryInitialize(&toastkey[1],
							   (bits16) 0,
							   (AttrNumber) 2,
							   (RegProcedure) F_INT4GE,
							   Int32GetDatum(startchunk));
		ScanKeyEntryInitialize(&toastkey[2],
							   (bits16) 0,
							   (AttrNumber) 2,
							   (RegProcedure) F_INT4LE,
							   Int32GetDatum(endchunk));
		nscankeys = 3;
	}

	/*
	 * Read the chunks by index
	 *
	 * The index is on (valueid, chunkidx) so they will come in order
	 */
	nextidx = startchunk;
	toastscan = index_beginscan(toastrel, toastidx, SnapshotToast,
								nscankeys, toastkey);
	while ((ttup = index_getnext(toastscan, ForwardScanDirection)) != NULL)
	{
		/*
		 * Have a chunk, extract the sequence number and the data
		 */
		residx = DatumGetInt32(heap_getattr(ttup, 2, toasttupDesc, &isnull));
		Assert(!isnull);
		chunk = DatumGetPointer(heap_getattr(ttup, 3, toasttupDesc, &isnull));
		Assert(!isnull);
		chunksize = VARATT_SIZE(chunk) - VARHDRSZ;

		/*
		 * Some checks on the data we've found
		 */
		if ((residx != nextidx) || (residx > endchunk) || (residx < startchunk))
			elog(ERROR, "unexpected chunk number %d (expected %d) for toast value %u",
				 residx, nextidx,
				 attr->va_content.va_external.va_valueid);
		if (residx < totalchunks - 1)
		{
			if (chunksize != TOAST_MAX_CHUNK_SIZE)
				elog(ERROR, "unexpected chunk size %d in chunk %d for toast value %u",
					 chunksize, residx,
					 attr->va_content.va_external.va_valueid);
		}
		else
		{
			if ((residx * TOAST_MAX_CHUNK_SIZE + chunksize) != attrsize)
				elog(ERROR, "unexpected chunk size %d in chunk %d for toast value %u",
					 chunksize, residx,
					 attr->va_content.va_external.va_valueid);
		}

		/*
		 * Copy the data into proper place in our result
		 */
		chcpystrt = 0;
		chcpyend = chunksize - 1;
		if (residx == startchunk)
			chcpystrt = startoffset;
		if (residx == endchunk)
			chcpyend = endoffset;

		memcpy(((char *) VARATT_DATA(result)) +
			   (residx * TOAST_MAX_CHUNK_SIZE - sliceoffset) + chcpystrt,
			   VARATT_DATA(chunk) + chcpystrt,
			   (chcpyend - chcpystrt) + 1);

		nextidx++;
	}

	/*
	 * Final checks that we successfully fetched the datum
	 */
	if (nextidx != (endchunk + 1))
		elog(ERROR, "missing chunk number %d for toast value %u",
			 nextidx,
			 attr->va_content.va_external.va_valueid);

	/*
	 * End scan and close relations
	 */
	index_endscan(toastscan);
	index_close(toastidx);
	heap_close(toastrel, AccessShareLock);

	return result;
}
Example #21
/* ----------
 * toast_fetch_datum -
 *
 *	Reconstruct an in memory varattrib from the chunks saved
 *	in the toast relation
 * ----------
 */
static varattrib *
toast_fetch_datum(varattrib *attr)
{
	Relation	toastrel;
	Relation	toastidx;
	ScanKeyData toastkey;
	IndexScanDesc toastscan;
	HeapTuple	ttup;
	TupleDesc	toasttupDesc;
	varattrib  *result;
	int32		ressize;
	int32		residx,
				nextidx;
	int32		numchunks;
	Pointer		chunk;
	bool		isnull;
	int32		chunksize;

	ressize = attr->va_content.va_external.va_extsize;
	numchunks = ((ressize - 1) / TOAST_MAX_CHUNK_SIZE) + 1;

	result = (varattrib *) palloc(ressize + VARHDRSZ);
	VARATT_SIZEP(result) = ressize + VARHDRSZ;
	if (VARATT_IS_COMPRESSED(attr))
		VARATT_SIZEP(result) |= VARATT_FLAG_COMPRESSED;

	/*
	 * Open the toast relation and its index
	 */
	toastrel = heap_open(attr->va_content.va_external.va_toastrelid,
						 AccessShareLock);
	toasttupDesc = toastrel->rd_att;
	toastidx = index_open(toastrel->rd_rel->reltoastidxid);

	/*
	 * Setup a scan key to fetch from the index by va_valueid
	 */
	ScanKeyEntryInitialize(&toastkey,
						   (bits16) 0,
						   (AttrNumber) 1,
						   (RegProcedure) F_OIDEQ,
			  ObjectIdGetDatum(attr->va_content.va_external.va_valueid));

	/*
	 * Read the chunks by index
	 *
	 * Note that because the index is actually on (valueid, chunkidx) we will
	 * see the chunks in chunkidx order, even though we didn't explicitly
	 * ask for it.
	 */
	nextidx = 0;

	toastscan = index_beginscan(toastrel, toastidx, SnapshotToast,
								1, &toastkey);
	while ((ttup = index_getnext(toastscan, ForwardScanDirection)) != NULL)
	{
		/*
		 * Have a chunk, extract the sequence number and the data
		 */
		residx = DatumGetInt32(heap_getattr(ttup, 2, toasttupDesc, &isnull));
		Assert(!isnull);
		chunk = DatumGetPointer(heap_getattr(ttup, 3, toasttupDesc, &isnull));
		Assert(!isnull);
		chunksize = VARATT_SIZE(chunk) - VARHDRSZ;

		/*
		 * Some checks on the data we've found
		 */
		if (residx != nextidx)
			elog(ERROR, "unexpected chunk number %d (expected %d) for toast value %u",
				 residx, nextidx,
				 attr->va_content.va_external.va_valueid);
		if (residx < numchunks - 1)
		{
			if (chunksize != TOAST_MAX_CHUNK_SIZE)
				elog(ERROR, "unexpected chunk size %d in chunk %d for toast value %u",
					 chunksize, residx,
					 attr->va_content.va_external.va_valueid);
		}
		else if (residx < numchunks)
		{
			if ((residx * TOAST_MAX_CHUNK_SIZE + chunksize) != ressize)
				elog(ERROR, "unexpected chunk size %d in chunk %d for toast value %u",
					 chunksize, residx,
					 attr->va_content.va_external.va_valueid);
		}
		else
			elog(ERROR, "unexpected chunk number %d for toast value %u",
				 residx,
				 attr->va_content.va_external.va_valueid);

		/*
		 * Copy the data into proper place in our result
		 */
		memcpy(((char *) VARATT_DATA(result)) + residx * TOAST_MAX_CHUNK_SIZE,
			   VARATT_DATA(chunk),
			   chunksize);

		nextidx++;
	}

	/*
	 * Final checks that we successfully fetched the datum
	 */
	if (nextidx != numchunks)
		elog(ERROR, "missing chunk number %d for toast value %u",
			 nextidx,
			 attr->va_content.va_external.va_valueid);

	/*
	 * End scan and close relations
	 */
	index_endscan(toastscan);
	index_close(toastidx);
	heap_close(toastrel, AccessShareLock);

	return result;
}
Example #22
void
inv_truncate(LargeObjectDesc *obj_desc, int len)
{
	int32		pageno = (int32) (len / LOBLKSIZE);
	int			off;
	ScanKeyData skey[2];
	IndexScanDesc sd;
	HeapTuple	oldtuple;
	Form_pg_largeobject olddata;
	struct
	{
		bytea		hdr;
		char		data[LOBLKSIZE];	/* make struct big enough */
		int32		align_it;	/* ensure struct is aligned well enough */
	}			workbuf;
	char	   *workb = VARDATA(&workbuf.hdr);
	HeapTuple	newtup;
	Datum		values[Natts_pg_largeobject];
	bool		nulls[Natts_pg_largeobject];
	bool		replace[Natts_pg_largeobject];
	CatalogIndexState indstate;

	Assert(PointerIsValid(obj_desc));

	/* enforce writability because snapshot is probably wrong otherwise */
	if ((obj_desc->flags & IFS_WRLOCK) == 0)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("large object %u was not opened for writing",
						obj_desc->id)));

	open_lo_relation();

	indstate = CatalogOpenIndexes(lo_heap_r);

	ScanKeyInit(&skey[0],
				Anum_pg_largeobject_loid,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(obj_desc->id));

	ScanKeyInit(&skey[1],
				Anum_pg_largeobject_pageno,
				BTGreaterEqualStrategyNumber, F_INT4GE,
				Int32GetDatum(pageno));

	sd = index_beginscan(lo_heap_r, lo_index_r,
						 obj_desc->snapshot, 2, skey);

	/*
	 * If possible, get the page the truncation point is in. The truncation
	 * point may be beyond the end of the LO or in a hole.
	 */
	olddata = NULL;
	if ((oldtuple = index_getnext(sd, ForwardScanDirection)) != NULL)
	{
		if (HeapTupleHasNulls(oldtuple))		/* paranoia */
			elog(ERROR, "null field found in pg_largeobject");
		olddata = (Form_pg_largeobject) GETSTRUCT(oldtuple);
		Assert(olddata->pageno >= pageno);
	}

	/*
	 * If we found the page of the truncation point we need to truncate the
	 * data in it.	Otherwise if we're in a hole, we need to create a page to
	 * mark the end of data.
	 */
	if (olddata != NULL && olddata->pageno == pageno)
	{
		/* First, load old data into workbuf */
		bytea	   *datafield = &(olddata->data);		/* see note at top of
														 * file */
		bool		pfreeit = false;
		int			pagelen;

		if (VARATT_IS_EXTENDED(datafield))
		{
			datafield = (bytea *)
				heap_tuple_untoast_attr((struct varlena *) datafield);
			pfreeit = true;
		}
		pagelen = getbytealen(datafield);
		Assert(pagelen <= LOBLKSIZE);
		memcpy(workb, VARDATA(datafield), pagelen);
		if (pfreeit)
			pfree(datafield);

		/*
		 * Fill any hole
		 */
		off = len % LOBLKSIZE;
		if (off > pagelen)
			MemSet(workb + pagelen, 0, off - pagelen);

		/* compute length of new page */
		SET_VARSIZE(&workbuf.hdr, off + VARHDRSZ);

		/*
		 * Form and insert updated tuple
		 */
		memset(values, 0, sizeof(values));
		memset(nulls, false, sizeof(nulls));
		memset(replace, false, sizeof(replace));
		values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf);
		replace[Anum_pg_largeobject_data - 1] = true;
		newtup = heap_modify_tuple(oldtuple, RelationGetDescr(lo_heap_r),
								   values, nulls, replace);
		simple_heap_update(lo_heap_r, &newtup->t_self, newtup);
		CatalogIndexInsert(indstate, newtup);
		heap_freetuple(newtup);
	}
	else
	{
		/*
		 * If the first page we found was after the truncation point, we're in
		 * a hole that we'll fill, but we need to delete the later page.
		 */
		if (olddata != NULL && olddata->pageno > pageno)
			simple_heap_delete(lo_heap_r, &oldtuple->t_self);

		/*
		 * Write a brand new page.
		 *
		 * Fill the hole up to the truncation point
		 */
		off = len % LOBLKSIZE;
		if (off > 0)
			MemSet(workb, 0, off);

		/* compute length of new page */
		SET_VARSIZE(&workbuf.hdr, off + VARHDRSZ);

		/*
		 * Form and insert new tuple
		 */
		memset(values, 0, sizeof(values));
		memset(nulls, false, sizeof(nulls));
		values[Anum_pg_largeobject_loid - 1] = ObjectIdGetDatum(obj_desc->id);
		values[Anum_pg_largeobject_pageno - 1] = Int32GetDatum(pageno);
		values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf);
		newtup = heap_form_tuple(lo_heap_r->rd_att, values, nulls);
		simple_heap_insert(lo_heap_r, newtup);
		CatalogIndexInsert(indstate, newtup);
		heap_freetuple(newtup);
	}

	/*
	 * Delete any pages after the truncation point
	 */
	while ((oldtuple = index_getnext(sd, ForwardScanDirection)) != NULL)
	{
		simple_heap_delete(lo_heap_r, &oldtuple->t_self);
	}

	index_endscan(sd);

	CatalogCloseIndexes(indstate);

	/*
	 * Advance command counter so that tuple updates will be seen by later
	 * large-object operations in this transaction.
	 */
	CommandCounterIncrement();
}
Example #23
int
inv_write(LargeObjectDesc *obj_desc, const char *buf, int nbytes)
{
	int			nwritten = 0;
	int			n;
	int			off;
	int			len;
	int32		pageno = (int32) (obj_desc->offset / LOBLKSIZE);
	ScanKeyData skey[2];
	IndexScanDesc sd;
	HeapTuple	oldtuple;
	Form_pg_largeobject olddata;
	bool		neednextpage;
	bytea	   *datafield;
	bool		pfreeit;
	struct
	{
		bytea		hdr;
		char		data[LOBLKSIZE];	/* make struct big enough */
		int32		align_it;	/* ensure struct is aligned well enough */
	}			workbuf;
	char	   *workb = VARDATA(&workbuf.hdr);
	HeapTuple	newtup;
	Datum		values[Natts_pg_largeobject];
	bool		nulls[Natts_pg_largeobject];
	bool		replace[Natts_pg_largeobject];
	CatalogIndexState indstate;

	Assert(PointerIsValid(obj_desc));
	Assert(buf != NULL);

	/* enforce writability because snapshot is probably wrong otherwise */
	if ((obj_desc->flags & IFS_WRLOCK) == 0)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("large object %u was not opened for writing",
						obj_desc->id)));

	if (nbytes <= 0)
		return 0;

	open_lo_relation();

	indstate = CatalogOpenIndexes(lo_heap_r);

	ScanKeyInit(&skey[0],
				Anum_pg_largeobject_loid,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(obj_desc->id));

	ScanKeyInit(&skey[1],
				Anum_pg_largeobject_pageno,
				BTGreaterEqualStrategyNumber, F_INT4GE,
				Int32GetDatum(pageno));

	sd = index_beginscan(lo_heap_r, lo_index_r,
						 obj_desc->snapshot, 2, skey);

	oldtuple = NULL;
	olddata = NULL;
	neednextpage = true;

	while (nwritten < nbytes)
	{
		/*
		 * If possible, get next pre-existing page of the LO.  We assume the
		 * indexscan will deliver these in order --- but there may be holes.
		 */
		if (neednextpage)
		{
			if ((oldtuple = index_getnext(sd, ForwardScanDirection)) != NULL)
			{
				if (HeapTupleHasNulls(oldtuple))		/* paranoia */
					elog(ERROR, "null field found in pg_largeobject");
				olddata = (Form_pg_largeobject) GETSTRUCT(oldtuple);
				Assert(olddata->pageno >= pageno);
			}
			neednextpage = false;
		}

		/*
		 * If we have a pre-existing page, see if it is the page we want to
		 * write, or a later one.
		 */
		if (olddata != NULL && olddata->pageno == pageno)
		{
			/*
			 * Update an existing page with fresh data.
			 *
			 * First, load old data into workbuf
			 */
			datafield = &(olddata->data);		/* see note at top of file */
			pfreeit = false;
			if (VARATT_IS_EXTENDED(datafield))
			{
				datafield = (bytea *)
					heap_tuple_untoast_attr((struct varlena *) datafield);
				pfreeit = true;
			}
			len = getbytealen(datafield);
			Assert(len <= LOBLKSIZE);
			memcpy(workb, VARDATA(datafield), len);
			if (pfreeit)
				pfree(datafield);

			/*
			 * Fill any hole
			 */
			off = (int) (obj_desc->offset % LOBLKSIZE);
			if (off > len)
				MemSet(workb + len, 0, off - len);

			/*
			 * Insert appropriate portion of new data
			 */
			n = LOBLKSIZE - off;
			n = (n <= (nbytes - nwritten)) ? n : (nbytes - nwritten);
			memcpy(workb + off, buf + nwritten, n);
			nwritten += n;
			obj_desc->offset += n;
			off += n;
			/* compute valid length of new page */
			len = (len >= off) ? len : off;
			SET_VARSIZE(&workbuf.hdr, len + VARHDRSZ);

			/*
			 * Form and insert updated tuple
			 */
			memset(values, 0, sizeof(values));
			memset(nulls, false, sizeof(nulls));
			memset(replace, false, sizeof(replace));
			values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf);
			replace[Anum_pg_largeobject_data - 1] = true;
			newtup = heap_modify_tuple(oldtuple, RelationGetDescr(lo_heap_r),
									   values, nulls, replace);
			simple_heap_update(lo_heap_r, &newtup->t_self, newtup);
			CatalogIndexInsert(indstate, newtup);
			heap_freetuple(newtup);

			/*
			 * We're done with this old page.
			 */
			oldtuple = NULL;
			olddata = NULL;
			neednextpage = true;
		}
		else
		{
			/*
			 * Write a brand new page.
			 *
			 * First, fill any hole
			 */
			off = (int) (obj_desc->offset % LOBLKSIZE);
			if (off > 0)
				MemSet(workb, 0, off);

			/*
			 * Insert appropriate portion of new data
			 */
			n = LOBLKSIZE - off;
			n = (n <= (nbytes - nwritten)) ? n : (nbytes - nwritten);
			memcpy(workb + off, buf + nwritten, n);
			nwritten += n;
			obj_desc->offset += n;
			/* compute valid length of new page */
			len = off + n;
			SET_VARSIZE(&workbuf.hdr, len + VARHDRSZ);

			/*
			 * Form and insert updated tuple
			 */
			memset(values, 0, sizeof(values));
			memset(nulls, false, sizeof(nulls));
			values[Anum_pg_largeobject_loid - 1] = ObjectIdGetDatum(obj_desc->id);
			values[Anum_pg_largeobject_pageno - 1] = Int32GetDatum(pageno);
			values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf);
			newtup = heap_form_tuple(lo_heap_r->rd_att, values, nulls);
			simple_heap_insert(lo_heap_r, newtup);
			CatalogIndexInsert(indstate, newtup);
			heap_freetuple(newtup);
		}
		pageno++;
	}

	index_endscan(sd);

	CatalogCloseIndexes(indstate);

	/*
	 * Advance command counter so that my tuple updates will be seen by later
	 * large-object operations in this transaction.
	 */
	CommandCounterIncrement();

	return nwritten;
}
Example #24
bool
CheckNewRelFileNodeIsOk(Oid newOid, Oid reltablespace, bool relisshared, 
						Relation pg_class)
{
	RelFileNode rnode;
	char	   *rpath;
	int			fd;
	bool		collides;
	
	
	if (pg_class)
	{
		Oid			oidIndex;
		Relation	indexrel;
		IndexScanDesc scan;
		ScanKeyData key;
	
		Assert(!IsBootstrapProcessingMode());
		Assert(pg_class->rd_rel->relhasoids);
	
		/* The relcache will cache the identity of the OID index for us */
		oidIndex = RelationGetOidIndex(pg_class);
	
		Assert(OidIsValid(oidIndex));
		
		indexrel = index_open(oidIndex, AccessShareLock);
		
		ScanKeyInit(&key,
					(AttrNumber) 1,
					BTEqualStrategyNumber, F_OIDEQ,
					ObjectIdGetDatum(newOid));

		scan = index_beginscan(pg_class, indexrel, SnapshotDirty, 1, &key);

		collides = HeapTupleIsValid(index_getnext(scan, ForwardScanDirection));

		index_endscan(scan);
		
		index_close(indexrel, AccessShareLock);
		
		if (collides)
			elog(ERROR, "relfilenode %d already in use in \"pg_class\"",
				 newOid);	
		
	}

	/* This should match RelationInitPhysicalAddr */
	rnode.spcNode = reltablespace ? reltablespace : MyDatabaseTableSpace;
	rnode.dbNode = relisshared ? InvalidOid : MyDatabaseId;
	
	rnode.relNode = newOid;
	
	/* Check for existing file of same name */
	rpath = relpath(rnode);
	fd = BasicOpenFile(rpath, O_RDONLY | PG_BINARY, 0);

	if (fd >= 0)
	{
		/* definite collision */
		gp_retry_close(fd);
		collides = true;
	}
	else
		collides = false;

	pfree(rpath);
	
	if (collides && !relisshared)
		elog(ERROR, "oid %d already in use", newOid);	

	while(GetNewObjectId() < newOid);

	return !collides;
}
Example #25
int
inv_read(LargeObjectDesc *obj_desc, char *buf, int nbytes)
{
	int			nread = 0;
	int			n;
	int			off;
	int			len;
	int32		pageno = (int32) (obj_desc->offset / LOBLKSIZE);
	uint32		pageoff;
	ScanKeyData skey[2];
	IndexScanDesc sd;
	HeapTuple	tuple;

	Assert(PointerIsValid(obj_desc));
	Assert(buf != NULL);

	if (nbytes <= 0)
		return 0;

	open_lo_relation();

	ScanKeyInit(&skey[0],
				Anum_pg_largeobject_loid,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(obj_desc->id));

	ScanKeyInit(&skey[1],
				Anum_pg_largeobject_pageno,
				BTGreaterEqualStrategyNumber, F_INT4GE,
				Int32GetDatum(pageno));

	sd = index_beginscan(lo_heap_r, lo_index_r,
						 SnapshotNow, 2, skey);

	while ((tuple = index_getnext(sd, ForwardScanDirection)) != NULL)
	{
		Form_pg_largeobject data;
		bytea	   *datafield;
		bool		pfreeit;

		data = (Form_pg_largeobject) GETSTRUCT(tuple);

		/*
		 * We assume the indexscan will deliver pages in order.  However,
		 * there may be missing pages if the LO contains unwritten
		 * "holes". We want missing sections to read out as zeroes.
		 */
		pageoff = ((uint32) data->pageno) * LOBLKSIZE;
		if (pageoff > obj_desc->offset)
		{
			n = pageoff - obj_desc->offset;
			n = (n <= (nbytes - nread)) ? n : (nbytes - nread);
			MemSet(buf + nread, 0, n);
			nread += n;
			obj_desc->offset += n;
		}

		if (nread < nbytes)
		{
			Assert(obj_desc->offset >= pageoff);
			off = (int) (obj_desc->offset - pageoff);
			Assert(off >= 0 && off < LOBLKSIZE);

			datafield = &(data->data);
			pfreeit = false;
			if (VARATT_IS_EXTENDED(datafield))
			{
				datafield = (bytea *)
					heap_tuple_untoast_attr((varattrib *) datafield);
				pfreeit = true;
			}
			len = getbytealen(datafield);
			if (len > off)
			{
				n = len - off;
				n = (n <= (nbytes - nread)) ? n : (nbytes - nread);
				memcpy(buf + nread, VARDATA(datafield) + off, n);
				nread += n;
				obj_desc->offset += n;
			}
			if (pfreeit)
				pfree(datafield);
		}

		if (nread >= nbytes)
			break;
	}

	index_endscan(sd);

	return nread;
}
Example #26
int
inv_write(LargeObjectDesc *obj_desc, char *buf, int nbytes)
{
	int			nwritten = 0;
	int			n;
	int			off;
	int			len;
	int32		pageno = (int32) (obj_desc->offset / LOBLKSIZE);
	ScanKeyData skey[2];
	IndexScanDesc sd;
	HeapTuple	oldtuple;
	Form_pg_largeobject olddata;
	bool		neednextpage;
	bytea	   *datafield;
	bool		pfreeit;
	struct
	{
		bytea		hdr;
		char		data[LOBLKSIZE];
	}			workbuf;
	char	   *workb = VARATT_DATA(&workbuf.hdr);
	HeapTuple	newtup;
	Datum		values[Natts_pg_largeobject];
	char		nulls[Natts_pg_largeobject];
	char		replace[Natts_pg_largeobject];
	CatalogIndexState indstate;

	Assert(PointerIsValid(obj_desc));
	Assert(buf != NULL);

	if (nbytes <= 0)
		return 0;

	open_lo_relation();

	indstate = CatalogOpenIndexes(lo_heap_r);

	ScanKeyInit(&skey[0],
				Anum_pg_largeobject_loid,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(obj_desc->id));

	ScanKeyInit(&skey[1],
				Anum_pg_largeobject_pageno,
				BTGreaterEqualStrategyNumber, F_INT4GE,
				Int32GetDatum(pageno));

	sd = index_beginscan(lo_heap_r, lo_index_r,
						 SnapshotNow, 2, skey);

	oldtuple = NULL;
	olddata = NULL;
	neednextpage = true;

	while (nwritten < nbytes)
	{
		/*
		 * If possible, get next pre-existing page of the LO.  We assume
		 * the indexscan will deliver these in order --- but there may be
		 * holes.
		 */
		if (neednextpage)
		{
			if ((oldtuple = index_getnext(sd, ForwardScanDirection)) != NULL)
			{
				olddata = (Form_pg_largeobject) GETSTRUCT(oldtuple);
				Assert(olddata->pageno >= pageno);
			}
			neednextpage = false;
		}

		/*
		 * If we have a pre-existing page, see if it is the page we want
		 * to write, or a later one.
		 */
		if (olddata != NULL && olddata->pageno == pageno)
		{
			/*
			 * Update an existing page with fresh data.
			 *
			 * First, load old data into workbuf
			 */
			datafield = &(olddata->data);
			pfreeit = false;
			if (VARATT_IS_EXTENDED(datafield))
			{
				datafield = (bytea *)
					heap_tuple_untoast_attr((varattrib *) datafield);
				pfreeit = true;
			}
			len = getbytealen(datafield);
			Assert(len <= LOBLKSIZE);
			memcpy(workb, VARDATA(datafield), len);
			if (pfreeit)
				pfree(datafield);

			/*
			 * Fill any hole
			 */
			off = (int) (obj_desc->offset % LOBLKSIZE);
			if (off > len)
				MemSet(workb + len, 0, off - len);

			/*
			 * Insert appropriate portion of new data
			 */
			n = LOBLKSIZE - off;
			n = (n <= (nbytes - nwritten)) ? n : (nbytes - nwritten);
			memcpy(workb + off, buf + nwritten, n);
			nwritten += n;
			obj_desc->offset += n;
			off += n;
			/* compute valid length of new page */
			len = (len >= off) ? len : off;
			VARATT_SIZEP(&workbuf.hdr) = len + VARHDRSZ;

			/*
			 * Form and insert updated tuple
			 */
			memset(values, 0, sizeof(values));
			memset(nulls, ' ', sizeof(nulls));
			memset(replace, ' ', sizeof(replace));
			values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf);
			replace[Anum_pg_largeobject_data - 1] = 'r';
			newtup = heap_modifytuple(oldtuple, lo_heap_r,
									  values, nulls, replace);
			simple_heap_update(lo_heap_r, &newtup->t_self, newtup);
			CatalogIndexInsert(indstate, newtup);
			heap_freetuple(newtup);

			/*
			 * We're done with this old page.
			 */
			oldtuple = NULL;
			olddata = NULL;
			neednextpage = true;
		}
		else
		{
			/*
			 * Write a brand new page.
			 *
			 * First, fill any hole
			 */
			off = (int) (obj_desc->offset % LOBLKSIZE);
			if (off > 0)
				MemSet(workb, 0, off);

			/*
			 * Insert appropriate portion of new data
			 */
			n = LOBLKSIZE - off;
			n = (n <= (nbytes - nwritten)) ? n : (nbytes - nwritten);
			memcpy(workb + off, buf + nwritten, n);
			nwritten += n;
			obj_desc->offset += n;
			/* compute valid length of new page */
			len = off + n;
			VARATT_SIZEP(&workbuf.hdr) = len + VARHDRSZ;

			/*
			 * Form and insert updated tuple
			 */
			memset(values, 0, sizeof(values));
			memset(nulls, ' ', sizeof(nulls));
			values[Anum_pg_largeobject_loid - 1] = ObjectIdGetDatum(obj_desc->id);
			values[Anum_pg_largeobject_pageno - 1] = Int32GetDatum(pageno);
			values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf);
			newtup = heap_formtuple(lo_heap_r->rd_att, values, nulls);
			simple_heap_insert(lo_heap_r, newtup);
			CatalogIndexInsert(indstate, newtup);
			heap_freetuple(newtup);
		}
		pageno++;
	}

	index_endscan(sd);

	CatalogCloseIndexes(indstate);

	/*
	 * Advance command counter so that my tuple updates will be seen by
	 * later large-object operations in this transaction.
	 */
	CommandCounterIncrement();

	return nwritten;
}
Example #27
/*
 * GetNewSequenceRelationOid
 *		Get a sequence relation Oid and verify it is valid against
 *		the pg_class relation by doing an index lookup. The caller
 *		should have a suitable lock on pg_class.
 */
Oid
GetNewSequenceRelationOid(Relation relation)
{
	Oid			newOid;
	Oid			oidIndex;
	Relation	indexrel;
	SnapshotData SnapshotDirty;
	IndexScanDesc scan;
	ScanKeyData key;
	bool		collides;
	RelFileNode rnode;
	char	   *rpath;
	int			fd;

	/* This should match RelationInitPhysicalAddr */
	rnode.spcNode = relation->rd_rel->reltablespace ? relation->rd_rel->reltablespace : MyDatabaseTableSpace;
	rnode.dbNode = relation->rd_rel->relisshared ? InvalidOid : MyDatabaseId;

	/* We should only be using pg_class */
	Assert(RelationGetRelid(relation) == RelationRelationId);

	/* The relcache will cache the identity of the OID index for us */
	oidIndex = RelationGetOidIndex(relation);

	/* Otherwise, use the index to find a nonconflicting OID */
	indexrel = index_open(oidIndex, AccessShareLock);

	InitDirtySnapshot(SnapshotDirty);

	/* Generate new sequence relation OIDs until we find one not in the table */
	do
	{
		CHECK_FOR_INTERRUPTS();

		newOid = GetNewSequenceRelationObjectId();

		ScanKeyInit(&key,
					(AttrNumber) 1,
					BTEqualStrategyNumber, F_OIDEQ,
					ObjectIdGetDatum(newOid));

		/* see notes above about using SnapshotDirty */
		scan = index_beginscan(relation, indexrel,
							   &SnapshotDirty, 1, &key);

		collides = HeapTupleIsValid(index_getnext(scan, ForwardScanDirection));

		index_endscan(scan);

		if (!collides)
		{
			/* Check for existing file of same name */
			rpath = relpath(rnode);
			fd = BasicOpenFile(rpath, O_RDONLY | PG_BINARY, 0);

			if (fd >= 0)
			{
				/* definite collision */
				gp_retry_close(fd);
				collides = true;
			}
			else
			{
				/*
				 * Here we have a little bit of a dilemma: if errno is something
				 * other than ENOENT, should we declare a collision and loop? In
				 * particular one might think this advisable for, say, EPERM.
				 * However there really shouldn't be any unreadable files in a
				 * tablespace directory, and if the EPERM is actually complaining
				 * that we can't read the directory itself, we'd be in an infinite
				 * loop.  In practice it seems best to go ahead regardless of the
				 * errno.  If there is a colliding file we will get an smgr
				 * failure when we attempt to create the new relation file.
				 */
				collides = false;
			}
		}

		/*
		 * Also check that the OID hasn't been pre-assigned for a different
		 * relation.
		 *
		 * We're a bit sloppy between OIDs and relfilenodes here; it would be
		 * OK to use a value that's been reserved for use as a type or
		 * relation OID here, as long as the relfilenode is free. But there's
		 * no harm in skipping over those too, so we don't bother to
		 * distinguish them.
		 */
		if (!collides && !IsOidAcceptable(newOid))
			collides = true;

	} while (collides);

	index_close(indexrel, AccessShareLock);

	return newOid;
}
Example #28
/*
 * Search the relation 'rel' for a tuple using the index.
 *
 * If a matching tuple is found, lock it with lockmode, fill the slot with its
 * contents, and return true.  Return false otherwise.
 */
bool
RelationFindReplTupleByIndex(Relation rel, Oid idxoid,
							 LockTupleMode lockmode,
							 TupleTableSlot *searchslot,
							 TupleTableSlot *outslot)
{
	HeapTuple	scantuple;
	ScanKeyData skey[INDEX_MAX_KEYS];
	IndexScanDesc scan;
	SnapshotData snap;
	TransactionId xwait;
	Relation	idxrel;
	bool		found;

	/* Open the index. */
	idxrel = index_open(idxoid, RowExclusiveLock);

	/* Start an index scan. */
	InitDirtySnapshot(snap);
	scan = index_beginscan(rel, idxrel, &snap,
						   RelationGetNumberOfAttributes(idxrel),
						   0);

	/* Build scan key. */
	build_replindex_scan_key(skey, rel, idxrel, searchslot);

retry:
	found = false;

	index_rescan(scan, skey, RelationGetNumberOfAttributes(idxrel), NULL, 0);

	/* Try to find the tuple */
	if ((scantuple = index_getnext(scan, ForwardScanDirection)) != NULL)
	{
		found = true;
		ExecStoreTuple(scantuple, outslot, InvalidBuffer, false);
		ExecMaterializeSlot(outslot);

		xwait = TransactionIdIsValid(snap.xmin) ?
			snap.xmin : snap.xmax;

		/*
		 * If the tuple is locked, wait for locking transaction to finish and
		 * retry.
		 */
		if (TransactionIdIsValid(xwait))
		{
			XactLockTableWait(xwait, NULL, NULL, XLTW_None);
			goto retry;
		}
	}

	/* Found tuple, try to lock it in the lockmode. */
	if (found)
	{
		Buffer		buf;
		HeapUpdateFailureData hufd;
		HTSU_Result res;
		HeapTupleData locktup;

		ItemPointerCopy(&outslot->tts_tuple->t_self, &locktup.t_self);

		PushActiveSnapshot(GetLatestSnapshot());

		res = heap_lock_tuple(rel, &locktup, GetCurrentCommandId(false),
							  lockmode,
							  LockWaitBlock,
							  false /* don't follow updates */ ,
							  &buf, &hufd);
		/* the tuple slot already has the buffer pinned */
		ReleaseBuffer(buf);

		PopActiveSnapshot();

		switch (res)
		{
			case HeapTupleMayBeUpdated:
				break;
			case HeapTupleUpdated:
				/* XXX: Improve handling here */
				ereport(LOG,
						(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
						 errmsg("concurrent update, retrying")));
				goto retry;
			case HeapTupleInvisible:
				elog(ERROR, "attempted to lock invisible tuple");
			default:
				elog(ERROR, "unexpected heap_lock_tuple status: %u", res);
				break;
		}
	}

	index_endscan(scan);

	/* Don't release lock until commit. */
	index_close(idxrel, NoLock);

	return found;
}
Example #29
/* ----------
 * toast_fetch_datum -
 *
 *	Reconstruct an in memory Datum from the chunks saved
 *	in the toast relation
 * ----------
 */
static struct varlena *
toast_fetch_datum(struct varlena *attr)
{
	Relation	toastrel;
	Relation	toastidx;
	ScanKeyData toastkey;
	IndexScanDesc toastscan;
	HeapTuple	ttup;
	TupleDesc	toasttupDesc;
	varattrib  *result;
	int32		ressize;
	int32		residx,
				nextidx;
	int32		numchunks;
	Pointer		chunk;
	bool		isnull;
	int32		chunksize;
	void 	   *chunkdata;

	ressize = ((varattrib *)attr)->va_external.va_extsize;
	numchunks = ((ressize - 1) / TOAST_MAX_CHUNK_SIZE) + 1;

	result = (varattrib *) palloc(ressize + VARHDRSZ);
	SET_VARSIZE(result, ressize + VARHDRSZ);
	if (VARATT_EXTERNAL_IS_COMPRESSED(attr))
		VARATT_SET_COMPRESSED(result);
	
	/*
	 * Open the toast relation and its index
	 */
	toastrel = heap_open(((varattrib *)attr)->va_external.va_toastrelid, AccessShareLock);
	toasttupDesc = toastrel->rd_att;
	toastidx = index_open(toastrel->rd_rel->reltoastidxid, AccessShareLock);

	/*
	 * Setup a scan key to fetch from the index by va_valueid
	 */
	ScanKeyInit(&toastkey,
				(AttrNumber) 1,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(((varattrib *)attr)->va_external.va_valueid));

	/*
	 * Read the chunks by index
	 *
	 * Note that because the index is actually on (valueid, chunkidx) we will
	 * see the chunks in chunkidx order, even though we didn't explicitly ask
	 * for it.
	 */
	nextidx = 0;
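	/*
	 * Note: the scan below uses the special SnapshotToast rather than an MVCC
	 * snapshot; chunk visibility is governed by the owning main-heap tuple.
	 */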

	toastscan = index_beginscan(toastrel, toastidx,
								SnapshotToast, 1, &toastkey);
	while ((ttup = index_getnext(toastscan, ForwardScanDirection)) != NULL)
	{
		/*
		 * Have a chunk, extract the sequence number and the data
		 */
		residx = DatumGetInt32(fastgetattr(ttup, 2, toasttupDesc, &isnull));
		Assert(!isnull);
		chunk = DatumGetPointer(fastgetattr(ttup, 3, toasttupDesc, &isnull));
		Assert(!isnull);
		if (VARATT_IS_SHORT(chunk)) 
		{
			chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
			chunkdata = VARDATA_SHORT(chunk);
		}
		else if (!VARATT_IS_EXTENDED(chunk)) 
		{
			chunksize = VARSIZE(chunk) - VARHDRSZ;
			chunkdata = VARDATA(chunk);
		}
		else 
		{
			elog(ERROR, "found toasted toast chunk?");
			chunksize = 0; /* shut compiler up */
			chunkdata = NULL;
		}

		/*
		 * Some checks on the data we've found
		 */
		if (residx != nextidx)
			elog(ERROR, "unexpected chunk number %d (expected %d) for toast value %u",
				 residx, nextidx,
				 ((varattrib *)attr)->va_external.va_valueid);
		if (residx < numchunks - 1)
		{
			if (chunksize != TOAST_MAX_CHUNK_SIZE)
				elog(ERROR, "unexpected chunk size %d in chunk %d of %d for toast value %u (expected %d)",
					 chunksize, residx, numchunks - 1,
					 ((varattrib *)attr)->va_external.va_valueid,
					 (int) TOAST_MAX_CHUNK_SIZE);
		}
		else if (residx == numchunks-1)
		{
			if ((residx * TOAST_MAX_CHUNK_SIZE + chunksize) != ressize)
				elog(ERROR, "unexpected chunk size %d in final chunk %d for toast value %u (expected %d)",
					 chunksize, residx,
					 ((varattrib *)attr)->va_external.va_valueid,
					 ressize - residx*(int)TOAST_MAX_CHUNK_SIZE);
		}
		else
			elog(ERROR, "unexpected chunk number %d for toast value %u (expected in %d..%d)",
				 residx,
				 ((varattrib *)attr)->va_external.va_valueid,
				 0, numchunks-1);

		/*
		 * Copy the data into proper place in our result
		 */
		memcpy(((char *) VARDATA(result)) + residx * TOAST_MAX_CHUNK_SIZE,
			   chunkdata,
			   chunksize);

		nextidx++;
	}

	/*
	 * Final checks that we successfully fetched the datum
	 */
	if (nextidx != numchunks)
		elog(ERROR, "missing chunk number %d for toast value %u",
			 nextidx,
			 ((varattrib *)attr)->va_external.va_valueid);

	/*
	 * End scan and close relations
	 */
	index_endscan(toastscan);
	index_close(toastidx, AccessShareLock);
	heap_close(toastrel, AccessShareLock);

	return (struct varlena *)result;
}
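The sanity checks above reduce to ceiling-division arithmetic over TOAST_MAX_CHUNK_SIZE. The standalone C sketch below reproduces that arithmetic; the 1996-byte chunk size and the 5000-byte datum are assumptions for illustration only, since the real TOAST_MAX_CHUNK_SIZE depends on the block size the server was built with.

#include <stdio.h>

#define ASSUMED_CHUNK_SIZE 1996	/* stand-in for TOAST_MAX_CHUNK_SIZE */

int
main(void)
{
	int			ressize = 5000;	/* hypothetical external datum size */
	int			numchunks = ((ressize - 1) / ASSUMED_CHUNK_SIZE) + 1;
	int			lastidx = numchunks - 1;
	int			lastsize = ressize - lastidx * ASSUMED_CHUNK_SIZE;

	/*
	 * Every chunk except the last must be exactly ASSUMED_CHUNK_SIZE bytes;
	 * the last one carries the remainder.  For 5000 bytes this prints
	 * "3 chunks, last chunk 2 carries 1008 bytes".
	 */
	printf("%d chunks, last chunk %d carries %d bytes\n",
		   numchunks, lastidx, lastsize);
	return 0;
}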
Example #30
/* ----------
 * toast_fetch_datum_slice -
 *
 *	Reconstruct a segment of a Datum from the chunks saved
 *	in the toast relation
 * ----------
 */
static struct varlena *
toast_fetch_datum_slice(struct varlena *attr, int32 sliceoffset, int32 length)
{
	Relation	toastrel;
	Relation	toastidx;
	ScanKeyData toastkey[3];
	int			nscankeys;
	IndexScanDesc toastscan;
	HeapTuple	ttup;
	TupleDesc	toasttupDesc;
	varattrib  *result;
	int32		attrsize;
	int32		residx;
	int32		nextidx;
	int			numchunks;
	int			startchunk;
	int			endchunk;
	int32		startoffset;
	int32		endoffset;
	int			totalchunks;
	Pointer		chunk;
	bool		isnull;
	int32		chunksize;
	int32		chcpystrt;
	int32		chcpyend;

	attrsize = ((varattrib *)attr)->va_external.va_extsize;
	totalchunks = ((attrsize - 1) / TOAST_MAX_CHUNK_SIZE) + 1;

	if (sliceoffset >= attrsize)
	{
		sliceoffset = 0;
		length = 0;
	}

	if (((sliceoffset + length) > attrsize) || length < 0)
		length = attrsize - sliceoffset;

	result = (varattrib *) palloc(length + VARHDRSZ);
	SET_VARSIZE(result, length + VARHDRSZ);

	if (VARATT_EXTERNAL_IS_COMPRESSED(attr))
		VARATT_SET_COMPRESSED(result);

	if (length == 0)
		return (struct varlena *)result;			/* Can save a lot of work at this point! */

	startchunk = sliceoffset / TOAST_MAX_CHUNK_SIZE;
	endchunk = (sliceoffset + length - 1) / TOAST_MAX_CHUNK_SIZE;
	numchunks = (endchunk - startchunk) + 1;

	startoffset = sliceoffset % TOAST_MAX_CHUNK_SIZE;
	endoffset = (sliceoffset + length - 1) % TOAST_MAX_CHUNK_SIZE;

	/*
	 * Open the toast relation and its index
	 */
	toastrel = heap_open(((varattrib *)attr)->va_external.va_toastrelid, AccessShareLock);
	toasttupDesc = toastrel->rd_att;
	toastidx = index_open(toastrel->rd_rel->reltoastidxid, AccessShareLock);

	/*
	 * Set up a scan key to fetch from the index. This is either two keys or
	 * three depending on the number of chunks.
	 */
	ScanKeyInit(&toastkey[0],
				(AttrNumber) 1,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(((varattrib *)attr)->va_external.va_valueid));

	/*
	 * Use equality condition for one chunk, a range condition otherwise:
	 */
	if (numchunks == 1)
	{
		ScanKeyInit(&toastkey[1],
					(AttrNumber) 2,
					BTEqualStrategyNumber, F_INT4EQ,
					Int32GetDatum(startchunk));
		nscankeys = 2;
	}
	else
	{
		ScanKeyInit(&toastkey[1],
					(AttrNumber) 2,
					BTGreaterEqualStrategyNumber, F_INT4GE,
					Int32GetDatum(startchunk));
		ScanKeyInit(&toastkey[2],
					(AttrNumber) 2,
					BTLessEqualStrategyNumber, F_INT4LE,
					Int32GetDatum(endchunk));
		nscankeys = 3;
	}

	/*
	 * Read the chunks by index
	 *
	 * The index is on (valueid, chunkidx) so they will come in order
	 */
	nextidx = startchunk;
	toastscan = index_beginscan(toastrel, toastidx,
								SnapshotToast, nscankeys, toastkey);
	while ((ttup = index_getnext(toastscan, ForwardScanDirection)) != NULL)
	{
		/*
		 * Have a chunk, extract the sequence number and the data
		 */
		residx = DatumGetInt32(fastgetattr(ttup, 2, toasttupDesc, &isnull));
		Assert(!isnull);
		chunk = DatumGetPointer(fastgetattr(ttup, 3, toasttupDesc, &isnull));
		Assert(!isnull);
		if (VARATT_IS_SHORT((varattrib *)chunk))
			chunksize = VARSIZE_SHORT((varattrib *)chunk) - VARHDRSZ_SHORT;
		else if (!VARATT_IS_EXTENDED((varattrib *)chunk))
			chunksize = VARSIZE((varattrib *)chunk) - VARHDRSZ;
		else
		{
			elog(ERROR, "found toasted toast chunk?");
			chunksize = 0; /* shut compiler up */
		}

		/*
		 * Some checks on the data we've found
		 */
		if ((residx != nextidx) || (residx > endchunk) || (residx < startchunk))
			elog(ERROR, "unexpected chunk number %d (expected %d) for toast value %u",
				 residx, nextidx,
				 ((varattrib *)attr)->va_external.va_valueid);
		if (residx < totalchunks - 1)
		{
			if (chunksize != TOAST_MAX_CHUNK_SIZE)
				elog(ERROR, "unexpected chunk size %d in chunk %d of %d for toast value %u when fetching slice (expected %d)",
					 chunksize, residx, totalchunks - 1,
					 ((varattrib *)attr)->va_external.va_valueid,
					 (int) TOAST_MAX_CHUNK_SIZE);
		}
		else if (residx == totalchunks-1)
		{
			if ((residx * TOAST_MAX_CHUNK_SIZE + chunksize) != attrsize)
				elog(ERROR, "unexpected chunk size %d in final chunk %d for toast value %u when fetching slice (expected %d)",
					 chunksize, residx,
					 ((varattrib *)attr)->va_external.va_valueid,
					 attrsize - residx * (int) TOAST_MAX_CHUNK_SIZE);
		}
		else 
		{
			elog(ERROR, "unexpected chunk");
		}

		/*
		 * Copy the data into proper place in our result
		 */
		chcpystrt = 0;
		chcpyend = chunksize - 1;
		if (residx == startchunk)
			chcpystrt = startoffset;
		if (residx == endchunk)
			chcpyend = endoffset;

		memcpy(((char *) VARDATA(result)) +
			   (residx * TOAST_MAX_CHUNK_SIZE - sliceoffset) + chcpystrt,
			   VARDATA((varattrib *)chunk) + chcpystrt,
			   (chcpyend - chcpystrt) + 1);

		nextidx++;
	}

	/*
	 * Final checks that we successfully fetched the datum
	 */
	if (nextidx != (endchunk + 1))
		elog(ERROR, "missing chunk number %d for toast value %u",
			 nextidx,
			 ((varattrib *)attr)->va_external.va_valueid);

	/*
	 * End scan and close relations
	 */
	index_endscan(toastscan);
	index_close(toastidx, AccessShareLock);
	heap_close(toastrel, AccessShareLock);

	return (struct varlena *)result;
}
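The slice variant adds one more mapping: from a byte range to the chunk range it touches, plus the partial copies at both ends. The standalone C sketch below walks through that mapping with assumed numbers (1996-byte chunks, a 1000-byte slice starting at offset 3000); the same expressions appear above for startchunk, endchunk, startoffset and endoffset.

#include <stdio.h>

#define ASSUMED_CHUNK_SIZE 1996	/* stand-in for TOAST_MAX_CHUNK_SIZE */

int
main(void)
{
	int			sliceoffset = 3000;	/* hypothetical slice start */
	int			length = 1000;		/* hypothetical slice length */
	int			startchunk = sliceoffset / ASSUMED_CHUNK_SIZE;
	int			endchunk = (sliceoffset + length - 1) / ASSUMED_CHUNK_SIZE;
	int			startoffset = sliceoffset % ASSUMED_CHUNK_SIZE;
	int			endoffset = (sliceoffset + length - 1) % ASSUMED_CHUNK_SIZE;

	/*
	 * For this slice the scan covers chunks 1..2: bytes 1004..1995 of
	 * chunk 1 plus bytes 0..7 of chunk 2, i.e. 992 + 8 = 1000 bytes.
	 */
	printf("chunks %d..%d, start copying at byte %d, stop after byte %d\n",
		   startchunk, endchunk, startoffset, endoffset);
	return 0;
}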