Example 1
Datum
currtid_byreloid(PG_FUNCTION_ARGS)
{
	Oid			reloid = PG_GETARG_OID(0);
	ItemPointer tid = PG_GETARG_ITEMPOINTER(1);
	ItemPointer result;
	Relation	rel;

	result = (ItemPointer) palloc(sizeof(ItemPointerData));
	if (!reloid)
	{
		*result = Current_last_tid;
		PG_RETURN_ITEMPOINTER(result);
	}

	rel = heap_open(reloid, AccessShareLock);
	if (rel->rd_rel->relkind == RELKIND_VIEW)
		return currtid_for_view(rel, tid);

	ItemPointerCopy(tid, result);
	heap_get_latest_tid(rel, SnapshotNow, result);

	heap_close(rel, AccessShareLock);

	PG_RETURN_ITEMPOINTER(result);
}
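A minimal caller sketch (an illustration, not part of the original source): invoking currtid_byreloid from C through the fmgr, starting from the first line pointer of block 0. DirectFunctionCall2 and the Datum conversion macros are standard fmgr facilities; the helper name latest_tid_of is hypothetical.

/* Hypothetical helper: find the latest version of the tuple at (0,1). */
static ItemPointer
latest_tid_of(Oid reloid)
{
	ItemPointerData start;
	Datum		result;

	/* start at block 0, first line pointer */
	ItemPointerSet(&start, 0, FirstOffsetNumber);

	result = DirectFunctionCall2(currtid_byreloid,
								 ObjectIdGetDatum(reloid),
								 PointerGetDatum(&start));
	return (ItemPointer) DatumGetPointer(result);
}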
Example 2
Datum
currtid_byreloid(PG_FUNCTION_ARGS)
{
	Oid			reloid = PG_GETARG_OID(0);
	ItemPointer tid = PG_GETARG_ITEMPOINTER(1);
	ItemPointer result;
	Relation	rel;
	AclResult	aclresult;

	result = (ItemPointer) palloc(sizeof(ItemPointerData));
	if (!reloid)
	{
		*result = Current_last_tid;
		PG_RETURN_ITEMPOINTER(result);
	}

	rel = heap_open(reloid, AccessShareLock);

	aclresult = pg_class_aclcheck(RelationGetRelid(rel), GetUserId(),
								  ACL_SELECT);
	if (aclresult != ACLCHECK_OK)
		aclcheck_error(aclresult, ACL_KIND_CLASS,
					   RelationGetRelationName(rel));

	if (rel->rd_rel->relkind == RELKIND_VIEW)
		return currtid_for_view(rel, tid);

	ItemPointerCopy(tid, result);
	heap_get_latest_tid(rel, SnapshotNow, result);

	heap_close(rel, AccessShareLock);

	PG_RETURN_ITEMPOINTER(result);
}
void
PersistentTablespace_ActivateStandby(int16 oldmaster, int16 newmaster)
{
	TablespaceDirEntry tablespaceDirEntry;
	HASH_SEQ_STATUS hstat;

	WRITE_PERSISTENT_STATE_ORDERED_LOCK_DECLARE;

	hash_seq_init(&hstat, persistentTablespaceSharedHashTable);

	if (Persistent_BeforePersistenceWork())
		elog(ERROR, "persistent table changes forbidden");

	PersistentTablespace_VerifyInitScan();

	WRITE_PERSISTENT_STATE_ORDERED_LOCK;

	LWLockAcquire(TablespaceHashLock, LW_SHARED);
	while ((tablespaceDirEntry = hash_seq_search(&hstat)) != NULL)
	{
		PersistentFileSysObjName fsObjName;
		Oid			tblspc = tablespaceDirEntry->key.tablespaceOid;
		ItemPointerData persistentTid;
		uint64		persistentSerialNum;

		tablespaceDirEntry = PersistentTablespace_FindEntryUnderLock(tblspc);

		if (tablespaceDirEntry == NULL)
			elog(ERROR, "cannot find persistent tablespace entry %u",
				 tblspc);

		persistentSerialNum = tablespaceDirEntry->persistentSerialNum;
		ItemPointerCopy(&tablespaceDirEntry->persistentTid, &persistentTid);

		/*
		 * We release TablespaceHashLock in the middle of the loop and
		 * re-acquire it after doing persistent table change.  This is needed
		 * to prevent holding the lock for any purpose other than to protect
		 * the tablespace shared hash table.  Not releasing this lock could
		 * result in file I/O and potential deadlock due to other LW locks
		 * being acquired in the process.  Releasing the lock this way is safe
		 * because we are still holding PersistentObjLock in exclusive mode.
		 * Any change to the tablespace shared hash table is also protected by
		 * PersistentObjLock.
		 */

		LWLockRelease(TablespaceHashLock);

		PersistentFileSysObjName_SetTablespaceDir(&fsObjName, tblspc);
		PersistentFileSysObj_ActivateStandby(&fsObjName,
											 &persistentTid,
											 persistentSerialNum,
											 oldmaster,
											 newmaster,
											  /* flushToXlog */ false);
		LWLockAcquire(TablespaceHashLock, LW_SHARED);
	}
	LWLockRelease(TablespaceHashLock);
	WRITE_PERSISTENT_STATE_ORDERED_UNLOCK;
}
Example 4
Datum
currtid_byrelname(PG_FUNCTION_ARGS)
{
	text	   *relname = PG_GETARG_TEXT_P(0);
	ItemPointer tid = PG_GETARG_ITEMPOINTER(1);
	ItemPointer result;
	RangeVar   *relrv;
	Relation	rel;
	AclResult	aclresult;
	Snapshot	snapshot;

	relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
	rel = heap_openrv(relrv, AccessShareLock);

	aclresult = pg_class_aclcheck(RelationGetRelid(rel), GetUserId(),
								  ACL_SELECT);
	if (aclresult != ACLCHECK_OK)
		aclcheck_error(aclresult, ACL_KIND_CLASS,
					   RelationGetRelationName(rel));

	if (rel->rd_rel->relkind == RELKIND_VIEW || rel->rd_rel->relkind == RELKIND_CONTVIEW)
		return currtid_for_view(rel, tid);

	result = (ItemPointer) palloc(sizeof(ItemPointerData));
	ItemPointerCopy(tid, result);

	snapshot = RegisterSnapshot(GetLatestSnapshot());
	heap_get_latest_tid(rel, snapshot, result);
	UnregisterSnapshot(snapshot);

	heap_close(rel, AccessShareLock);

	PG_RETURN_ITEMPOINTER(result);
}
Example 5
/*
 * Fetches the next entry from a visimap store index scan.
 *
 * Parameter visiMapEntry may be NULL. If it is not NULL and
 * the scan returns an entry, the entry data is copied into
 * visiMapEntry.
 * Parameter tupleTid may be NULL. If it is not NULL and the scan
 * returns an entry, the (heap) tuple id is copied into it.
 */
bool
AppendOnlyVisimapStore_GetNext(
	AppendOnlyVisimapStore *visiMapStore,
	IndexScanDesc indexScan,
	ScanDirection scanDirection,
	AppendOnlyVisimapEntry* visiMapEntry,
	ItemPointerData *tupleTid)
{
	HeapTuple tuple;
	TupleDesc heapTupleDesc;

	Assert(visiMapStore);
	Assert(RelationIsValid(visiMapStore->visimapRelation));
	Assert(RelationIsValid(visiMapStore->visimapIndex));
	Assert(indexScan);

	tuple = AppendOnlyVisimapStore_GetNextTuple(visiMapStore, indexScan, scanDirection);
	if (tuple == NULL)
	{
		return false;
	}
	heapTupleDesc = RelationGetDescr(visiMapStore->visimapRelation);
	if (visiMapEntry)
	{
		AppendOnlyVisimapEntry_Copyout(visiMapEntry, tuple,
			heapTupleDesc);
	}
	if (tupleTid)
	{
		ItemPointerCopy(&tuple->t_self, tupleTid);
	}
	return true;
}
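A hedged usage sketch (assumed caller, not from the source): given an index scan the caller has already opened on the visimap store, drain it forward and collect only the heap TIDs by passing NULL for the entry out-parameter, as the header comment above permits. The helper name is hypothetical.

/* Hypothetical caller: count entries, copying out each heap tuple id. */
static int
AppendOnlyVisimapStore_CountEntries(AppendOnlyVisimapStore *visiMapStore,
									IndexScanDesc indexScan)
{
	ItemPointerData tupleTid;
	int			count = 0;

	while (AppendOnlyVisimapStore_GetNext(visiMapStore, indexScan,
										  ForwardScanDirection,
										  NULL /* visiMapEntry */ ,
										  &tupleTid))
		count++;

	return count;
}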
/*
 * Add a task; it will be performed on all segments.
 */
void SharedStorageOpAddTask(const char * relname, RelFileNode *node,
                            int32 segno, ItemPointer persistentTid,
                            int64 persistentSerialNum,
                            SharedStorageOpTasks *tasks) {
  Assert(NULL != node && NULL != tasks);
  Assert(tasks->sizeTasks >= tasks->numTasks);

  RelFileNode *n;

  if (tasks->sizeTasks == tasks->numTasks) {
    tasks->tasks = repalloc(tasks->tasks,
                            tasks->sizeTasks * sizeof(SharedStorageOpTask) * 2);
    tasks->sizeTasks *= 2;
  }

  n = &tasks->tasks[tasks->numTasks].node;
  n->dbNode = node->dbNode;
  n->relNode = node->relNode;
  n->spcNode = node->spcNode;

  tasks->tasks[tasks->numTasks].segno = segno;
  tasks->tasks[tasks->numTasks].relname = palloc(strlen(relname) + 1);
  strcpy(tasks->tasks[tasks->numTasks].relname, relname);
  ItemPointerCopy(persistentTid, &tasks->tasks[tasks->numTasks].persistentTid);
  tasks->tasks[tasks->numTasks].persistentSerialNum = persistentSerialNum;

  tasks->numTasks++;
}
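A hypothetical initializer sketch (assumed, not in the source): the doubling repalloc above presumes a task array that was palloc'd with a non-zero initial capacity, so a caller might seed the list like this before the first AddTask call.

/* Hypothetical helper: seed the task list so AddTask's doubling works. */
static void
SharedStorageOpTasks_Init(SharedStorageOpTasks *tasks, int initialSize)
{
	Assert(initialSize > 0);	/* AddTask doubles, so start non-zero */

	tasks->numTasks = 0;
	tasks->sizeTasks = initialSize;
	tasks->tasks = palloc(initialSize * sizeof(SharedStorageOpTask));
}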
Example 7
static bool PersistentStore_GetFreeTuple(
	PersistentStoreData 		*storeData,
	PersistentStoreSharedData 	*storeSharedData,
	ItemPointer				freeTid)
{
	ItemPointerData		previousFreeTid;

	MemSet(freeTid, 0, sizeof(ItemPointerData));

	if (Debug_persistent_store_print)
		elog(PersistentStore_DebugPrintLevel(), 
			 "PersistentStore_GetFreeTuple: Enter: maximum free order number " INT64_FORMAT ", free TID %s ('%s')",
			 storeSharedData->maxFreeOrderNum, 
			 ItemPointerToString(&storeSharedData->freeTid),
			 storeData->tableName);

	if (storeSharedData->maxFreeOrderNum == 0)
	{
		return false;	/* No free tuples. */
	}

	if (gp_persistent_skip_free_list)
	{
		if (Debug_persistent_store_print)
			elog(PersistentStore_DebugPrintLevel(), 
				 "PersistentStore_GetFreeTuple: Skipping because gp_persistent_skip_free_list GUC is ON ('%s')",
				 storeData->tableName);
		return false;	/* Pretend no free tuples. */
	}

	Assert(storeSharedData->freeTid.ip_posid != 0);

	if (!PersistentStore_ValidateFreeTID(
								storeData,
								storeSharedData,
								&previousFreeTid))
		return false;

	*freeTid = storeSharedData->freeTid;
	storeSharedData->maxFreeOrderNum--;
	ItemPointerCopy(&previousFreeTid /* previousFreeTid set inside the ValidateFreeTID function */,
					&storeSharedData->freeTid);

	if (Debug_persistent_store_print)
		elog(PersistentStore_DebugPrintLevel(), 
			 "PersistentStore_GetFreeTuple: Exit: maximum free order number " INT64_FORMAT ", free TID %s ('%s')",
			 storeSharedData->maxFreeOrderNum, 
			 ItemPointerToString(&storeSharedData->freeTid),
			 storeData->tableName);

	if (validate_previous_free_tid &&
		!PersistentStore_ValidateFreeTID(
										storeData,
										storeSharedData,
										&previousFreeTid))
		return false;

	return true;
}
Example 8
/*
 * Update a tuple in an in-memory heap table.
 *
 * If the target tuple is already in memory, update it in place and
 * mark it with the flag INMEM_HEAP_TUPLE_UPDATED; otherwise report
 * an error.
 *
 * The update must not change the otid of the old tuple, since the
 * updated tuple is written back to the master and updated there.
 */
void
InMemHeap_Update(InMemHeapRelation relation, ItemPointer otid,
        HeapTuple tup)
{
    int pos;
    HeapTuple target;
    MemoryContext oldmem = CurrentMemoryContext;

    Assert(ItemPointerIsValid(otid));

    pos = InMemHeap_Find(relation, otid);

    CurrentMemoryContext = relation->memcxt;

    /*
     * not found, report error
     */
    if (pos >= relation->tupsize)
    {
        ereport(ERROR,
                (errcode(ERRCODE_INTERNAL_ERROR),
                        errmsg("update a tuple which does not exist,"
                                " relname = %s, relid = %u", relation->rel->rd_rel->relname.data,
                                relation->relid)));
    }

    Insist(relation->hashIndex == NULL && "cannot handle index in in-memory heap when update");

    /*
     * already in table
     */
    Assert(relation->tuples[pos].flags == INMEM_HEAP_TUPLE_DISPATCHED
            || relation->tuples[pos].flags == INMEM_HEAP_TUPLE_UPDATED);
    relation->tuples[pos].flags = INMEM_HEAP_TUPLE_UPDATED;

    target = heaptuple_copy_to(tup, NULL, NULL );

    /*
     * do not modify original tuple header
     */
    ItemPointerCopy(&target->t_self, &relation->tuples[pos].tuple->t_self);

    Assert(ItemPointerEquals(&target->t_self, otid));

    memcpy(target->t_data, relation->tuples[pos].tuple->t_data,
            sizeof(HeapTupleHeaderData));

    CurrentMemoryContext = oldmem;

    pfree(relation->tuples[pos].tuple);
    relation->tuples[pos].tuple = target;
}
void
PersistentTablespace_RemoveSegment(int16 dbid, bool ismirror)
{
	TablespaceDirEntry tablespaceDirEntry;
	HASH_SEQ_STATUS hstat;

	WRITE_PERSISTENT_STATE_ORDERED_LOCK_DECLARE;

	hash_seq_init(&hstat, persistentTablespaceSharedHashTable);

	if (Persistent_BeforePersistenceWork())
		elog(ERROR, "persistent table changes forbidden");

	PersistentTablespace_VerifyInitScan();

	WRITE_PERSISTENT_STATE_ORDERED_LOCK;

	LWLockAcquire(TablespaceHashLock, LW_SHARED);
	while ((tablespaceDirEntry = hash_seq_search(&hstat)) != NULL)
	{
		PersistentFileSysObjName fsObjName;
		Oid			tblspc = tablespaceDirEntry->key.tablespaceOid;
		ItemPointerData persistentTid;
		uint64		persistentSerialNum;

		tablespaceDirEntry = PersistentTablespace_FindEntryUnderLock(tblspc);

		LWLockRelease(TablespaceHashLock);

		if (tablespaceDirEntry == NULL)
			elog(ERROR, "Did not find persistent tablespace entry %u",
				 tblspc);
		persistentSerialNum = tablespaceDirEntry->persistentSerialNum;
		ItemPointerCopy(&tablespaceDirEntry->persistentTid, &persistentTid);

		PersistentFileSysObjName_SetTablespaceDir(&fsObjName, tblspc);

		PersistentFileSysObj_RemoveSegment(&fsObjName,
										   &persistentTid,
										   persistentSerialNum,
										   dbid,
										   ismirror,
										    /* flushToXlog */ false);
		LWLockAcquire(TablespaceHashLock, LW_SHARED);
	}
	LWLockRelease(TablespaceHashLock);

	WRITE_PERSISTENT_STATE_ORDERED_UNLOCK;
}
Example 10
Datum
currtid_byrelname(PG_FUNCTION_ARGS)
{
	text	   *relname = PG_GETARG_TEXT_P(0);
	ItemPointer tid = PG_GETARG_ITEMPOINTER(1);
	ItemPointer result;
	RangeVar   *relrv;
	Relation	rel;
	AclResult	aclresult;

	/*
	 * Immediately inform client that the function is not supported
	 */

	elog(ERROR, "Function currtid2 is not supported by GPDB");

	relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
	rel = heap_openrv(relrv, AccessShareLock);

	aclresult = pg_class_aclcheck(RelationGetRelid(rel), GetUserId(),
								  ACL_SELECT);
	if (aclresult != ACLCHECK_OK)
		aclcheck_error(aclresult, ACL_KIND_CLASS,
					   RelationGetRelationName(rel));

	if (rel->rd_rel->relkind == RELKIND_VIEW)
		return currtid_for_view(rel, tid);

	result = (ItemPointer) palloc(sizeof(ItemPointerData));
	ItemPointerCopy(tid, result);

	heap_get_latest_tid(rel, SnapshotNow, result);

	heap_close(rel, AccessShareLock);

	PG_RETURN_ITEMPOINTER(result);
}
Example 11
Datum
currtid_byrelname(PG_FUNCTION_ARGS)
{
	text	   *relname = PG_GETARG_TEXT_P(0);
	ItemPointer tid = PG_GETARG_ITEMPOINTER(1);
	ItemPointer result;
	RangeVar   *relrv;
	Relation	rel;

	relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname,
												   "currtid_byrelname"));
	rel = heap_openrv(relrv, AccessShareLock);
	if (rel->rd_rel->relkind == RELKIND_VIEW)
		return currtid_for_view(rel, tid);

	result = (ItemPointer) palloc(sizeof(ItemPointerData));
	ItemPointerCopy(tid, result);

	heap_get_latest_tid(rel, SnapshotNow, result);

	heap_close(rel, AccessShareLock);

	PG_RETURN_ITEMPOINTER(result);
}
Example 12
/*
 * Search the relation 'rel' for tuple using the index.
 *
 * If a matching tuple is found, lock it with lockmode, fill the slot with its
 * contents, and return true.  Return false otherwise.
 */
bool
RelationFindReplTupleByIndex(Relation rel, Oid idxoid,
							 LockTupleMode lockmode,
							 TupleTableSlot *searchslot,
							 TupleTableSlot *outslot)
{
	HeapTuple	scantuple;
	ScanKeyData skey[INDEX_MAX_KEYS];
	IndexScanDesc scan;
	SnapshotData snap;
	TransactionId xwait;
	Relation	idxrel;
	bool		found;

	/* Open the index. */
	idxrel = index_open(idxoid, RowExclusiveLock);

	/* Start an index scan. */
	InitDirtySnapshot(snap);
	scan = index_beginscan(rel, idxrel, &snap,
						   RelationGetNumberOfAttributes(idxrel),
						   0);

	/* Build scan key. */
	build_replindex_scan_key(skey, rel, idxrel, searchslot);

retry:
	found = false;

	index_rescan(scan, skey, RelationGetNumberOfAttributes(idxrel), NULL, 0);

	/* Try to find the tuple */
	if ((scantuple = index_getnext(scan, ForwardScanDirection)) != NULL)
	{
		found = true;
		ExecStoreTuple(scantuple, outslot, InvalidBuffer, false);
		ExecMaterializeSlot(outslot);

		xwait = TransactionIdIsValid(snap.xmin) ?
			snap.xmin : snap.xmax;

		/*
		 * If the tuple is locked, wait for locking transaction to finish and
		 * retry.
		 */
		if (TransactionIdIsValid(xwait))
		{
			XactLockTableWait(xwait, NULL, NULL, XLTW_None);
			goto retry;
		}
	}

	/* Found tuple, try to lock it in the lockmode. */
	if (found)
	{
		Buffer		buf;
		HeapUpdateFailureData hufd;
		HTSU_Result res;
		HeapTupleData locktup;

		ItemPointerCopy(&outslot->tts_tuple->t_self, &locktup.t_self);

		PushActiveSnapshot(GetLatestSnapshot());

		res = heap_lock_tuple(rel, &locktup, GetCurrentCommandId(false),
							  lockmode,
							  LockWaitBlock,
							  false /* don't follow updates */ ,
							  &buf, &hufd);
		/* the tuple slot already has the buffer pinned */
		ReleaseBuffer(buf);

		PopActiveSnapshot();

		switch (res)
		{
			case HeapTupleMayBeUpdated:
				break;
			case HeapTupleUpdated:
				/* XXX: Improve handling here */
				ereport(LOG,
						(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
						 errmsg("concurrent update, retrying")));
				goto retry;
			case HeapTupleInvisible:
				elog(ERROR, "attempted to lock invisible tuple");
			default:
				elog(ERROR, "unexpected heap_lock_tuple status: %u", res);
				break;
		}
	}

	index_endscan(scan);

	/* Don't release lock until commit. */
	index_close(idxrel, NoLock);

	return found;
}
void
PersistentTablespace_AddCreated(
								Oid filespaceOid,
 /* The filespace where the tablespace lives. */

								Oid tablespaceOid,
 /* The tablespace OID to be added. */

								MirroredObjectExistenceState mirrorExistenceState,

								bool flushToXLog)
 /* When true, the XLOG record for this change will be flushed to disk. */

{
	WRITE_PERSISTENT_STATE_ORDERED_LOCK_DECLARE;

	PersistentFileSysObjName fsObjName;

	ItemPointerData persistentTid;
	int64		persistentSerialNum;
	TablespaceDirEntry tablespaceDirEntry;

	if (Persistent_BeforePersistenceWork())
	{
		if (Debug_persistent_print)
			elog(Persistent_DebugPrintLevel(),
				 "Skipping persistent tablespace %u because we are before persistence work",
				 tablespaceOid);

		return;

		/*
		 * The initdb process will load the persistent table once we are out
		 * of bootstrap mode.
		 */
	}

	PersistentTablespace_VerifyInitScan();

	PersistentFileSysObjName_SetTablespaceDir(&fsObjName, tablespaceOid);

	WRITE_PERSISTENT_STATE_ORDERED_LOCK;

	PersistentTablespace_AddTuple(
								  filespaceOid,
								  tablespaceOid,
								  PersistentFileSysState_Created,
								   /* createMirrorDataLossTrackingSessionNum */ 0,
								  mirrorExistenceState,
								   /* reserved */ 0,
								  InvalidTransactionId,
								  flushToXLog,
								  &persistentTid,
								  &persistentSerialNum);

	WRITE_TABLESPACE_HASH_LOCK;
	tablespaceDirEntry =
		PersistentTablespace_CreateEntryUnderLock(filespaceOid, tablespaceOid);
	Assert(tablespaceDirEntry != NULL);
	tablespaceDirEntry->state = PersistentFileSysState_Created;
	ItemPointerCopy(&persistentTid, &tablespaceDirEntry->persistentTid);
	tablespaceDirEntry->persistentSerialNum = persistentSerialNum;
	WRITE_TABLESPACE_HASH_UNLOCK;

	WRITE_PERSISTENT_STATE_ORDERED_UNLOCK;

	if (Debug_persistent_print)
		elog(Persistent_DebugPrintLevel(),
			 "Persistent tablespace directory: Add '%s' in state 'Created', mirror existence state '%s', serial number " INT64_FORMAT " at TID '%s' ",
			 PersistentFileSysObjName_ObjectName(&fsObjName),
			 MirroredObjectExistenceState_Name(mirrorExistenceState),
			 persistentSerialNum,
			 ItemPointerToString(&persistentTid));
}
/*
 * Indicate we intend to create a tablespace file as part of the current transaction.
 *
 * An XLOG IntentToCreate record is generated that will guard the subsequent file-system
 * create in case the transaction aborts.
 *
 * After 1 or more calls to this routine to mark intention about tablespace files that are going
 * to be created, call ~_DoPendingCreates to do the actual file-system creates.  (See its
 * note on XLOG flushing).
 */
void
PersistentTablespace_MarkCreatePending(
									   Oid filespaceOid,
 /* The filespace where the tablespace lives. */

									   Oid tablespaceOid,
 /* The tablespace OID for the create. */

									   MirroredObjectExistenceState mirrorExistenceState,

									   ItemPointer persistentTid,
 /* TID of the gp_persistent_rel_files tuple for the rel file */

									   int64 *persistentSerialNum,


									   bool flushToXLog)
 /* When true, the XLOG record for this change will be flushed to disk. */

{
	WRITE_PERSISTENT_STATE_ORDERED_LOCK_DECLARE;

	PersistentFileSysObjName fsObjName;

	TablespaceDirEntry tablespaceDirEntry;
	TransactionId topXid;

	if (Persistent_BeforePersistenceWork())
	{
		if (Debug_persistent_print)
			elog(Persistent_DebugPrintLevel(),
				 "Skipping persistent tablespace %u because we are before persistence work",
				 tablespaceOid);

		return;

		/*
		 * The initdb process will load the persistent table once we are out
		 * of bootstrap mode.
		 */
	}

	PersistentTablespace_VerifyInitScan();

	PersistentFileSysObjName_SetTablespaceDir(&fsObjName, tablespaceOid);

	topXid = GetTopTransactionId();

	WRITE_PERSISTENT_STATE_ORDERED_LOCK;

	PersistentTablespace_AddTuple(
								  filespaceOid,
								  tablespaceOid,
								  PersistentFileSysState_CreatePending,
								   /* createMirrorDataLossTrackingSessionNum */ 0,
								  mirrorExistenceState,
								   /* reserved */ 0,
								   /* parentXid */ topXid,
								  flushToXLog,
								  persistentTid,
								  persistentSerialNum);

	WRITE_TABLESPACE_HASH_LOCK;
	tablespaceDirEntry =
		PersistentTablespace_CreateEntryUnderLock(filespaceOid, tablespaceOid);
	Assert(tablespaceDirEntry != NULL);
	tablespaceDirEntry->state = PersistentFileSysState_CreatePending;
	ItemPointerCopy(persistentTid, &tablespaceDirEntry->persistentTid);
	tablespaceDirEntry->persistentSerialNum = *persistentSerialNum;
	WRITE_TABLESPACE_HASH_UNLOCK;

	/*
	 * This XLOG must be generated under the persistent write-lock.
	 */
#ifdef MASTER_MIRROR_SYNC
	mmxlog_log_create_tablespace(
								 filespaceOid,
								 tablespaceOid);
#endif

	SIMPLE_FAULT_INJECTOR(FaultBeforePendingDeleteTablespaceEntry);

	/*
	 * MPP-18228 To make adding 'Create Pending' entry to persistent table and
	 * adding to the PendingDelete list atomic
	 */
	PendingDelete_AddCreatePendingEntryWrapper(
											   &fsObjName,
											   persistentTid,
											   *persistentSerialNum);

	WRITE_PERSISTENT_STATE_ORDERED_UNLOCK;

	if (Debug_persistent_print)
		elog(Persistent_DebugPrintLevel(),
			 "Persistent tablespace directory: Add '%s' in state 'Created', mirror existence state '%s', serial number " INT64_FORMAT " at TID %s",
			 PersistentFileSysObjName_ObjectName(&fsObjName),
			 MirroredObjectExistenceState_Name(mirrorExistenceState),
			 *persistentSerialNum,
			 ItemPointerToString(persistentTid));
}
Example 15
/*
 * _bt_mergeload - Merge two streams of index tuples into new index files.
 */
static void
_bt_mergeload(Spooler *self, BTWriteState *wstate, BTSpool *btspool, BTReader *btspool2, Relation heapRel)
{
	BTPageState	   *state = NULL;
	IndexTuple		itup,
					itup2;
	bool			should_free = false;
	TupleDesc		tupdes = RelationGetDescr(wstate->index);
	int				keysz = RelationGetNumberOfAttributes(wstate->index);
	ScanKey			indexScanKey;
	ON_DUPLICATE	on_duplicate = self->on_duplicate;

	Assert(btspool != NULL);

	/* the preparation of merge */
	itup = BTSpoolGetNextItem(btspool, NULL, &should_free);
	itup2 = BTReaderGetNextItem(btspool2);
	indexScanKey = _bt_mkscankey_nodata(wstate->index);

	for (;;)
	{
		bool	load1 = true;		/* load BTSpool next ? */
		bool	hasnull;
		int32	compare;

		if (self->dup_old + self->dup_new > self->max_dup_errors)
			ereport(ERROR,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("Maximum duplicate error count exceeded")));

		if (itup2 == NULL)
		{
			if (itup == NULL)
				break;
		}
		else if (itup != NULL)
		{
			compare = compare_indextuple(itup, itup2, indexScanKey,
										 keysz, tupdes, &hasnull);

			if (compare == 0 && !hasnull && btspool->isunique)
			{
				ItemPointerData t_tid2;

				/*
				 * t_tid is updated by heap_is_visible(); since we still need
				 * it for the index entry, back it up first.
				 */
				ItemPointerCopy(&itup2->t_tid, &t_tid2);

				/* The tuple pointed by the old index should not be visible. */
				if (!heap_is_visible(heapRel, &itup->t_tid))
				{
					itup = BTSpoolGetNextItem(btspool, itup, &should_free);
				}
				else if (!heap_is_visible(heapRel, &itup2->t_tid))
				{
					itup2 = BTReaderGetNextItem(btspool2);
				}
				else
				{
					if (on_duplicate == ON_DUPLICATE_KEEP_NEW)
					{
						self->dup_old++;
						remove_duplicate(self, heapRel, itup2,
							RelationGetRelationName(wstate->index));
						itup2 = BTReaderGetNextItem(btspool2);
					}
					else
					{
						ItemPointerCopy(&t_tid2, &itup2->t_tid);
						self->dup_new++;
						remove_duplicate(self, heapRel, itup,
							RelationGetRelationName(wstate->index));
						itup = BTSpoolGetNextItem(btspool, itup, &should_free);
					}
				}

				continue;
			}
			else if (compare > 0)
				load1 = false;
		}
		else
			load1 = false;

		BULKLOAD_PROFILE(&prof_merge_unique);

		/* When we see first tuple, create first index page */
		if (state == NULL)
			state = _bt_pagestate(wstate, 0);

		if (load1)
		{
			IndexTuple	next_itup = NULL;
			bool		next_should_free = false;

			for (;;)
			{
				/* get next item */
				next_itup = BTSpoolGetNextItem(btspool, next_itup,
											   &next_should_free);

				if (!btspool->isunique || next_itup == NULL)
					break;

				compare = compare_indextuple(itup, next_itup, indexScanKey,
											 keysz, tupdes, &hasnull);
				if (compare < 0 || hasnull)
					break;

				if (compare > 0)
				{
					/* shouldn't happen */
					elog(ERROR, "faild in tuplesort_performsort");
				}

				/*
				 * If the tuple was deleted via another unique index, it is
				 * not visible; skip it.
				 */
				if (!heap_is_visible(heapRel, &next_itup->t_tid))
				{
					continue;
				}

				if (!heap_is_visible(heapRel, &itup->t_tid))
				{
					if (should_free)
						pfree(itup);

					itup = next_itup;
					should_free = next_should_free;
					next_should_free = false;
					continue;
				}

				/* not unique between input files */
				self->dup_new++;
				remove_duplicate(self, heapRel, next_itup,
								 RelationGetRelationName(wstate->index));

				if (self->dup_old + self->dup_new > self->max_dup_errors)
					ereport(ERROR,
							(errcode(ERRCODE_INTERNAL_ERROR),
							 errmsg("Maximum duplicate error count exceeded")));
			}

			_bt_buildadd(wstate, state, itup);

			if (should_free)
				pfree(itup);

			itup = next_itup;
			should_free = next_should_free;
		}
		else
		{
			_bt_buildadd(wstate, state, itup2);
			itup2 = BTReaderGetNextItem(btspool2);
		}
		BULKLOAD_PROFILE(&prof_merge_insert);
	}
	_bt_freeskey(indexScanKey);

	/* Close down final pages and write the metapage */
	_bt_uppershutdown(wstate, state);

	/*
	 * If the index isn't temp, we must fsync it down to disk before it's safe
	 * to commit the transaction.  (For a temp index we don't care since the
	 * index will be uninteresting after a crash anyway.)
	 *
	 * It's obvious that we must do this when not WAL-logging the build. It's
	 * less obvious that we have to do it even if we did WAL-log the index
	 * pages.  The reason is that since we're building outside shared buffers,
	 * a CHECKPOINT occurring during the build has no way to flush the
	 * previously written data to disk (indeed it won't know the index even
	 * exists).  A crash later on would replay WAL from the checkpoint,
	 * therefore it wouldn't replay our earlier WAL entries. If we do not
	 * fsync those pages here, they might still not be on disk when the crash
	 * occurs.
	 */
	if (!RELATION_IS_LOCAL(wstate->index))
	{
		RelationOpenSmgr(wstate->index);
		smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
	}
	BULKLOAD_PROFILE(&prof_merge_term);
}
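Stripped of visibility checks and duplicate handling, _bt_mergeload above is a classic two-stream merge. The toy function below (illustration only, not from the source) shows the same control flow over two sorted integer arrays.

/* Toy two-stream merge: emits the union of two sorted arrays in order. */
static void
merge_two_sorted(const int *a, int na, const int *b, int nb,
				 void (*emit) (int))
{
	int			i = 0,
				j = 0;

	while (i < na || j < nb)
	{
		/* load from 'a' when 'b' is exhausted or a[i] sorts first */
		if (j >= nb || (i < na && a[i] <= b[j]))
			emit(a[i++]);
		else
			emit(b[j++]);
	}
}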
Example 16
void PersistentStore_FreeTuple(
	PersistentStoreData 		*storeData,
	PersistentStoreSharedData 	*storeSharedData,
	ItemPointer 			persistentTid,
				/* TID of the stored tuple. */
	Datum					*freeValues,
	bool					flushToXLog)
				/* When true, the XLOG record for this change will be flushed to disk. */
{
	Relation	persistentRel;
	HeapTuple	persistentTuple = NULL;
	ItemPointerData prevFreeTid;
	XLogRecPtr xlogEndLoc;
				/* The end location of the UPDATE XLOG record. */

	Assert( LWLockHeldByMe(PersistentObjLock) );
				
#ifdef USE_ASSERT_CHECKING
	if (storeSharedData == NULL ||
		!PersistentStoreSharedData_EyecatcherIsValid(storeSharedData))
		elog(ERROR, "Persistent store shared-memory not valid");
#endif
				
	if (Debug_persistent_store_print)
		elog(PersistentStore_DebugPrintLevel(), 
			 "PersistentStore_FreeTuple: Going to free tuple at TID %s ('%s', shared data %p)",
			 ItemPointerToString(persistentTid),
			 storeData->tableName,
			 storeSharedData);
	
	Assert(persistentTid->ip_posid != 0);

	persistentRel = (*storeData->openRel)();

	prevFreeTid = storeSharedData->freeTid;
	if (validate_previous_free_tid)
	{
		/* Sanity-check that prevFreeTid is really free. */
		ItemPointerData tmpPrevFreeTid;
		PersistentStore_ValidateFreeTID(
								storeData,
								storeSharedData,
								&tmpPrevFreeTid);
	}

	storeSharedData->maxFreeOrderNum++;
	if (storeSharedData->maxFreeOrderNum == 1)
		ItemPointerCopy(persistentTid, &prevFreeTid);  /* So non-zero PreviousFreeTid indicates free. */

	storeSharedData->freeTid = *persistentTid;

	PersistentStore_FormTupleSetOurs(
							storeData,
							persistentRel->rd_att,
							freeValues,
							storeSharedData->maxFreeOrderNum,
							&prevFreeTid,
							&persistentTuple);

	persistentTuple->t_self = *persistentTid;
		
	frozen_heap_inplace_update(persistentRel, persistentTuple);

	/*
	 * XLOG location of the UPDATE tuple's XLOG record.
	 */
	xlogEndLoc = XLogLastInsertEndLoc();

	heap_freetuple(persistentTuple);

	(*storeData->closeRel)(persistentRel);

	storeSharedData->inUseCount--;

	if (Debug_persistent_store_print)
		elog(PersistentStore_DebugPrintLevel(), 
			 "PersistentStore_FreeTuple: Freed tuple at TID %s.  Maximum free order number " INT64_FORMAT ", in use count " INT64_FORMAT " ('%s')",
			 ItemPointerToString(&storeSharedData->freeTid),
			 storeSharedData->maxFreeOrderNum, 
			 storeSharedData->inUseCount,
			 storeData->tableName);

	if (flushToXLog)
	{
		XLogFlush(xlogEndLoc);
		XLogRecPtr_Zero(&nowaitXLogEndLoc);
	}
	else
		nowaitXLogEndLoc = xlogEndLoc;
}
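A self-contained toy model (illustration only, plain ints standing in for TIDs) of the free-list invariant shared by PersistentStore_GetFreeTuple in Example 7 and PersistentStore_FreeTuple above: free slots form a LIFO threaded through previousFreeTid, freeTid is the head, maxFreeOrderNum is the chain length, and the first slot pushed points at itself so that a non-zero previous pointer always means "free".

#define NSLOTS 16

static int	prevFree[NSLOTS];	/* 0 = in use; k+1 = next free slot is k */
static int	freeHead = 0;		/* 1-based head of the free LIFO; 0 = none */
static int	freeCount = 0;		/* cf. maxFreeOrderNum */

static void
free_push(int slot)				/* cf. PersistentStore_FreeTuple */
{
	/* first pushed slot points at itself, so non-zero marks it free */
	prevFree[slot] = (freeCount == 0) ? slot + 1 : freeHead;
	freeHead = slot + 1;
	freeCount++;
}

static int
free_pop(void)					/* cf. PersistentStore_GetFreeTuple */
{
	int			slot;

	if (freeCount == 0)
		return -1;				/* no free slots */

	slot = freeHead - 1;
	freeHead = prevFree[slot];	/* follow the chain */
	prevFree[slot] = 0;			/* slot is in use again */
	freeCount--;
	return slot;
}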
Example 17
/*
 * MultiRelease -- release a multi-level lock
 *
 * Returns: TRUE if successful, FALSE otherwise.
 */
bool
MultiRelease(LockTableId tableId,
	     LOCKTAG *tag,
	     LOCKT	lockt,
	     LOCK_LEVEL level)
{
    LOCKT 	locks[N_LEVELS];
    int		i,status;
    LOCKTAG 	xxTag, *tmpTag = &xxTag;
    
    /* 
     * same level scheme as MultiAcquire().
     */
    switch (level) {
    case RELN_LEVEL:
	locks[0] = lockt;
	locks[1] = NO_LOCK;
	locks[2] = NO_LOCK;
	break;
    case PAGE_LEVEL:
	locks[0] = lockt + INTENT;
	locks[1] = lockt;
	locks[2] = NO_LOCK;
	break;
    case TUPLE_LEVEL:
	locks[0] = lockt + INTENT;
	locks[1] = lockt + INTENT;
	locks[2] = lockt;
	break;
    default:
	elog(WARN,"MultiRelease: bad lockt");
    }
    
    /*
     * again, construct the tag on the fly.  This time, however,
     * we release the locks in the REVERSE order -- from lowest
     * level to highest level.  
     *
     * Must zero out the tag to set padding bytes to zero and ensure
     * hashing consistency.
     */
    memset(tmpTag, 0, sizeof(*tmpTag));
    tmpTag->relId = tag->relId;
    tmpTag->dbId =  tag->dbId;
    
    for (i=(N_LEVELS-1); i>=0; i--) {
	if (locks[i] != NO_LOCK) {
	    switch (i) {
	    case RELN_LEVEL:
		/* -------------
		 * Set the block # and offset to invalid
		 * -------------
		 */
		BlockIdSet(&(tmpTag->tupleId.ip_blkid), InvalidBlockNumber);
		tmpTag->tupleId.ip_posid = InvalidOffsetNumber;
		break;
	    case PAGE_LEVEL:
		/* -------------
		 * Copy the block #, set the offset to invalid
		 * -------------
		 */
		BlockIdCopy(&(tmpTag->tupleId.ip_blkid),
			    &(tag->tupleId.ip_blkid));
		tmpTag->tupleId.ip_posid = InvalidOffsetNumber;
		break;
	    case TUPLE_LEVEL:
		/* --------------
		 * Copy the entire tuple id; note ItemPointerCopy(from, to).
		 * --------------
		 */
		ItemPointerCopy(&tag->tupleId, &tmpTag->tupleId);
		break;
	    }
	    status = LockRelease(tableId, tmpTag, locks[i]);
	    if (! status) {
		elog(WARN,"MultiRelease: couldn't release after error");
	    }
	}
    }
    /* shouldn't reach here */
    return false;
}
Example 18
/*
 * MultiAcquire -- acquire multi level lock at requested level
 *
 * Returns: TRUE if lock is set, FALSE if not
 * Side Effects:
 */
bool
MultiAcquire(LockTableId tableId,
	     LOCKTAG *tag,
	     LOCKT lockt,
	     LOCK_LEVEL level)
{
    LOCKT locks[N_LEVELS];
    int	i,status;
    LOCKTAG 	xxTag, *tmpTag = &xxTag;
    int	retStatus = TRUE;
    
    /*
     * Three levels implemented.  If we set a low level (e.g. Tuple)
     * lock, we must set INTENT locks on the higher levels.  The 
     * intent lock detects conflicts between the low level lock
     * and an existing high level lock.  For example, setting a
     * write lock on a tuple in a relation is disallowed if there
     * is an existing read lock on the entire relation.  The
     * write lock would set a WRITE + INTENT lock on the relation
     * and that lock would conflict with the read.
     */
    switch (level) {
    case RELN_LEVEL:
	locks[0] = lockt;
	locks[1] = NO_LOCK;
	locks[2] = NO_LOCK;
	break;
    case PAGE_LEVEL:
	locks[0] = lockt + INTENT;
	locks[1] = lockt;
	locks[2] = NO_LOCK;
	break;
    case TUPLE_LEVEL:
	locks[0] = lockt + INTENT;
	locks[1] = lockt + INTENT;
	locks[2] = lockt;
	break;
    default:
	elog(WARN,"MultiAcquire: bad lock level");
	return(FALSE);
    }
    
    /*
     * construct a new tag as we go. Always loop through all levels,
     * but if we aren't setting a low level lock, locks[i] is set to
     * NO_LOCK for the lower levels.  Always start from the highest
     * level and go to the lowest level. 
     */
    memset(tmpTag,0,sizeof(*tmpTag));
    tmpTag->relId = tag->relId;
    tmpTag->dbId = tag->dbId;
    
    for (i=0;i<N_LEVELS;i++) {
	if (locks[i] != NO_LOCK) {
	    switch (i) {
	    case RELN_LEVEL:
		/* -------------
		 * Set the block # and offset to invalid
		 * -------------
		 */
		BlockIdSet(&(tmpTag->tupleId.ip_blkid), InvalidBlockNumber);
		tmpTag->tupleId.ip_posid = InvalidOffsetNumber;
		break;
	    case PAGE_LEVEL:
		/* -------------
		 * Copy the block #, set the offset to invalid
		 * -------------
		 */
		BlockIdCopy(&(tmpTag->tupleId.ip_blkid),
			    &(tag->tupleId.ip_blkid));
		tmpTag->tupleId.ip_posid = InvalidOffsetNumber;
		break;
	    case TUPLE_LEVEL:
		/* --------------
		 * Copy the entire tuple id.
		 * --------------
		 */
		ItemPointerCopy(&tmpTag->tupleId, &tag->tupleId);
		break;
	    }
	    
	    status = LockAcquire(tableId, tmpTag, locks[i]);
	    if (! status) {
		/* failed for some reason. Before returning we have
		 * to release all of the locks we just acquired.
		 * MultiRelease(xx,xx,xx, i) means release starting from
		 * the last level lock we successfully acquired
		 */
		retStatus = FALSE;
		(void) MultiRelease(tableId, tag, lockt, i);
		/* now leave the loop.  Don't try for any more locks */
		break;
	    }
	}
    }
    return(retStatus);
}
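A caller sketch for the multi-level lock API (hypothetical glue; the WRITE_LOCK lock-mode constant is an assumption, and only the LOCKTAG fields the code above touches are filled in): acquire a tuple-level write lock, which MultiAcquire escalates into WRITE+INTENT locks at the page and relation levels.

/* Hypothetical caller: lock one tuple for writing. */
static bool
lock_one_tuple(LockTableId tableId, LOCKTAG *proto, ItemPointer tid)
{
	LOCKTAG		tag;

	/* zero the tag to clear padding bytes, as the comments above require */
	memset(&tag, 0, sizeof(tag));
	tag.relId = proto->relId;
	tag.dbId = proto->dbId;
	ItemPointerCopy(tid, &tag.tupleId);

	/* WRITE_LOCK is an assumed lock-mode constant */
	return MultiAcquire(tableId, &tag, WRITE_LOCK, TUPLE_LEVEL);
}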
Example 19
/*
 * Rebuild free TID list based on freeEntryHashTable.  Returns number
 * of free tuples in the rebuilt free list.
 */
uint64
PersistentStore_RebuildFreeList(
	PersistentStoreData 		*storeData,
	PersistentStoreSharedData 	*storeSharedData)
{
	Datum				*values;
	PersistentStoreScan storeScan;
	ItemPointerData		persistentTid;
	ItemPointerData		previousFreeTid;
	ItemPointerData		previousTid;
	uint64				persistentSerialNum;
	uint64				freeOrderNum;


	values = (Datum*)palloc(storeData->numAttributes * sizeof(Datum));

	/*
	 * PT shared data must be already initialized, even when we are
	 * called during recovery.
	 */
	Assert(!PersistentStore_IsZeroTid(&storeSharedData->maxTid));

	if (storeSharedData->maxFreeOrderNum < 1)
	{
		elog(LOG, "no free tuples in %s, not building any free list",
			 storeData->tableName);
		return 0;
	}
	elog(LOG, "rebuilding free list in %s with " INT64_FORMAT " free tuples",
		 storeData->tableName, storeSharedData->maxFreeOrderNum);

	/*
	 * Scan PT for free entries (in TID order) and establish links
	 * with previous free entry as we go on.
	 */
	previousTid.ip_posid = 0;
	freeOrderNum = 0;
	PersistentStore_BeginScan(storeData, storeSharedData, &storeScan);
	while (PersistentStore_GetNext(
				   &storeScan,
				   values,
				   &persistentTid,
				   (int64 *)&persistentSerialNum))
	{
		/*
		 * We are scanning from low to high TID.  All TIDs we
		 * encounter should be smaller or equal to the known
		 * maxTid.
		 */
		Assert(ItemPointerCompare(
					   &storeSharedData->maxTid,
					   &persistentTid) >= 0);

		PersistentStore_ExtractOurTupleData(
				storeData,
				values,
				(int64 *)&persistentSerialNum,
				&previousFreeTid);

		if (!PersistentStore_IsZeroTid(&previousFreeTid))
		{
			values[storeData->attNumPersistentSerialNum - 1] =
					Int64GetDatum(++freeOrderNum);
			values[storeData->attNumPreviousFreeTid - 1] =
					ItemPointerIsValid(&previousTid) ?
					PointerGetDatum(&previousTid) :
					PointerGetDatum(&persistentTid);
#ifdef FAULT_INJECTOR
			/*
			 * Inject fault after free list is partially built - a few
			 * tuples are updated but at least one is yet to be
			 * updated.
			 */
			if (freeOrderNum > 3)
			{
				FaultInjector_InjectFaultIfSet(
						RebuildPTDB,
						DDLNotSpecified,
						"",	// databaseName
						""); // tableName
			}
#endif
			PersistentStore_UpdateTuple(
					storeData, storeSharedData,	&persistentTid, values, true);
			ItemPointerCopy(&persistentTid, &previousTid);
		}
	}
	PersistentStore_EndScan(&storeScan);
	pfree(values);
	if (ItemPointerIsValid(&previousTid))
	{
		Assert(freeOrderNum > 0);
		ItemPointerCopy(&previousTid, &storeSharedData->freeTid);
		storeSharedData->maxFreeOrderNum = freeOrderNum;
		elog(LOG, "rebuilt free list in %s:  maxFreeOrderNum = " INT64_FORMAT
			 " freeTid = %s", storeData->tableName, freeOrderNum,
			 ItemPointerToString(&persistentTid));
	}
	return freeOrderNum;
}
Example 20
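/*
 * gistVacuumUpdate recursively vacuums the subtree rooted at blkno:
 * it folds updates from child pages into this page, splits the page if
 * the updated tuples no longer fit, and reports an empty page back to
 * the caller so the parent can drop its downlink and delete the child
 * subtree.
 */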
static ArrayTuple
gistVacuumUpdate(GistVacuum *gv, BlockNumber blkno, bool needunion)
{
	ArrayTuple	res = {NULL, 0, false};
	Buffer		buffer;
	Page		page,
				tempPage = NULL;
	OffsetNumber i,
				maxoff;
	ItemId		iid;
	int			lenaddon = 4,
				curlenaddon = 0,
				nOffToDelete = 0,
				nBlkToDelete = 0;
	IndexTuple	idxtuple,
			   *addon = NULL;
	bool		needwrite = false;
	OffsetNumber offToDelete[MaxOffsetNumber];
	BlockNumber blkToDelete[MaxOffsetNumber];
	ItemPointerData *completed = NULL;
	int			ncompleted = 0,
				lencompleted = 16;

	vacuum_delay_point();

	buffer = ReadBufferWithStrategy(gv->index, blkno, gv->strategy);
	LockBuffer(buffer, GIST_EXCLUSIVE);
	gistcheckpage(gv->index, buffer);
	page = (Page) BufferGetPage(buffer);
	maxoff = PageGetMaxOffsetNumber(page);

	if (GistPageIsLeaf(page))
	{
		if (GistTuplesDeleted(page))
			needunion = needwrite = true;
	}
	else
	{
		completed = (ItemPointerData *) palloc(sizeof(ItemPointerData) * lencompleted);
		addon = (IndexTuple *) palloc(sizeof(IndexTuple) * lenaddon);

		/* get copy of page to work */
		tempPage = GistPageGetCopyPage(page);

		for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
		{
			ArrayTuple	chldtuple;
			bool		needchildunion;

			iid = PageGetItemId(tempPage, i);
			idxtuple = (IndexTuple) PageGetItem(tempPage, iid);
			needchildunion = (GistTupleIsInvalid(idxtuple)) ? true : false;

			if (needchildunion)
				elog(DEBUG2, "gistVacuumUpdate: need union for block %u",
					 ItemPointerGetBlockNumber(&(idxtuple->t_tid)));

			chldtuple = gistVacuumUpdate(gv, ItemPointerGetBlockNumber(&(idxtuple->t_tid)),
										 needchildunion);
			if (chldtuple.ituplen || chldtuple.emptypage)
			{
				/* update the tuple and/or insert new ones */
				if (chldtuple.emptypage)
					blkToDelete[nBlkToDelete++] = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
				offToDelete[nOffToDelete++] = i;
				PageIndexTupleDelete(tempPage, i);
				i--;
				maxoff--;
				needwrite = needunion = true;

				if (chldtuple.ituplen)
				{

					Assert(chldtuple.emptypage == false);
					while (curlenaddon + chldtuple.ituplen >= lenaddon)
					{
						lenaddon *= 2;
						addon = (IndexTuple *) repalloc(addon, sizeof(IndexTuple) * lenaddon);
					}

					memcpy(addon + curlenaddon, chldtuple.itup, chldtuple.ituplen * sizeof(IndexTuple));

					curlenaddon += chldtuple.ituplen;

					if (chldtuple.ituplen > 1)
					{
						/*
						 * Child was split, so we need to mark the split
						 * insertion as completed.
						 */
						int			j;

						while (ncompleted + chldtuple.ituplen > lencompleted)
						{
							lencompleted *= 2;
							completed = (ItemPointerData *) repalloc(completed, sizeof(ItemPointerData) * lencompleted);
						}
						for (j = 0; j < chldtuple.ituplen; j++)
						{
							ItemPointerCopy(&(chldtuple.itup[j]->t_tid), completed + ncompleted);
							ncompleted++;
						}
					}
					pfree(chldtuple.itup);
				}
			}
		}

		Assert(maxoff == PageGetMaxOffsetNumber(tempPage));

		if (curlenaddon)
		{
			/* insert updated tuples */
			if (gistnospace(tempPage, addon, curlenaddon, InvalidOffsetNumber, 0))
			{
				/* there is no space on page to insert tuples */
				res = vacuumSplitPage(gv, tempPage, buffer, addon, curlenaddon);
				tempPage = NULL;	/* vacuumSplitPage() frees tempPage */
				needwrite = needunion = false;	/* gistSplit already forms
												 * unions and writes pages */
			}
			else
				/* enough free space */
				gistfillbuffer(gv->index, tempPage, addon, curlenaddon, InvalidOffsetNumber);
		}
	}

	/*
	 * If page is empty, we should remove pointer to it before deleting page
	 * (except root)
	 */

	if (blkno != GIST_ROOT_BLKNO && (PageIsEmpty(page) || (tempPage && PageIsEmpty(tempPage))))
	{
		/*
		 * The new version of the page is empty, so leave it unchanged; the
		 * upper call will mark our page as deleted. In case of a page split
		 * we will never get here.
		 *
		 * If the page was empty it cannot become non-empty during processing.
		 */
		res.emptypage = true;
		UnlockReleaseBuffer(buffer);
	}
	else
	{
		/* write the page and remove its children if needed */

		START_CRIT_SECTION();

		if (tempPage && needwrite)
		{
			PageRestoreTempPage(tempPage, page);
			tempPage = NULL;
		}

		/* Empty index */
		if (PageIsEmpty(page) && blkno == GIST_ROOT_BLKNO)
		{
			needwrite = true;
			GistPageSetLeaf(page);
		}


		if (needwrite)
		{
			MarkBufferDirty(buffer);
			GistClearTuplesDeleted(page);

			if (!gv->index->rd_istemp)
			{
				XLogRecData *rdata;
				XLogRecPtr	recptr;
				char	   *xlinfo;

				rdata = formUpdateRdata(gv->index->rd_node, buffer,
										offToDelete, nOffToDelete,
										addon, curlenaddon, NULL);
				xlinfo = rdata->next->data;

				recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE, rdata);
				PageSetLSN(page, recptr);
				PageSetTLI(page, ThisTimeLineID);

				pfree(xlinfo);
				pfree(rdata);
			}
			else
				PageSetLSN(page, XLogRecPtrForTemp);
		}

		END_CRIT_SECTION();

		if (needunion && !PageIsEmpty(page))
		{
			res.itup = (IndexTuple *) palloc(sizeof(IndexTuple));
			res.ituplen = 1;
			res.itup[0] = PageMakeUnionKey(gv, buffer);
		}

		UnlockReleaseBuffer(buffer);

		/* delete empty children; we no longer hold any links to those subtrees */
		for (i = 0; i < nBlkToDelete; i++)
			gistDeleteSubtree(gv, blkToDelete[i]);

		if (ncompleted && !gv->index->rd_istemp)
			gistxlogInsertCompletion(gv->index->rd_node, completed, ncompleted);
	}


	for (i = 0; i < curlenaddon; i++)
		pfree(addon[i]);
	if (addon)
		pfree(addon);
	if (completed)
		pfree(completed);
	if (tempPage)
		pfree(tempPage);

	return res;
}
Example 21
/*
 * extract_minipage
 *
 * Extract the minipage info from the given tuple. The tupleTid
 * is also set here.
 */
static void
extract_minipage(AppendOnlyBlockDirectory *blockDirectory,
				 HeapTuple tuple,
				 TupleDesc tupleDesc,
				 int columnGroupNo)
{
	Datum *values = blockDirectory->values;
	bool *nulls = blockDirectory->nulls;
	MinipagePerColumnGroup *minipageInfo =
		&blockDirectory->minipages[columnGroupNo];
	FileSegInfo *fsInfo = blockDirectory->currentSegmentFileInfo;
	int64 eof;
	int start, end, mid=0;
	bool found = false;
	
	heap_deform_tuple(tuple, tupleDesc, values, nulls);

	Assert(blockDirectory->currentSegmentFileNum ==
		   DatumGetInt32(values[Anum_pg_aoblkdir_segno - 1]));

	/*
	 * Copy out the minipage
	 */
	copy_out_minipage(minipageInfo,
					  values[Anum_pg_aoblkdir_minipage - 1],
					  nulls[Anum_pg_aoblkdir_minipage - 1]);

	ItemPointerCopy(&tuple->t_self, &minipageInfo->tupleTid);

	/*
	 * A crash or cancellation during inserts can leave out-of-date
	 * minipage entries in the block directory.
	 * We reset those entries here.
	 */
	Assert(fsInfo != NULL);
	if (!blockDirectory->isAOCol)
		eof = fsInfo->eof;
	else
		eof = ((AOCSFileSegInfo *)fsInfo)->vpinfo.entry[columnGroupNo].eof;

	start = 0;
	end = minipageInfo->numMinipageEntries - 1;
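	/*
	 * Binary-search the minipage entries by fileOffset for the segment
	 * file's logical EOF; entries at or beyond the EOF are stale.
	 */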
	while (start <= end)
	{
		mid = (end - start + 1) / 2 + start;
		if (minipageInfo->minipage->entry[mid].fileOffset > eof)
			end = mid - 1;
		else if (minipageInfo->minipage->entry[mid].fileOffset < eof)
			start = mid + 1;
		else
		{
			found = true;
			break;
		}
	}

	minipageInfo->numMinipageEntries = 0;
	if (found)
		minipageInfo->numMinipageEntries = mid;
	else if (start > 0)
	{
		minipageInfo->numMinipageEntries = start;
		Assert(minipageInfo->minipage->entry[start - 1].fileOffset < eof);
	}
}
Example 22
/*
 * Add a mirror.
 */
void
PersistentFilespace_AddMirror(Oid filespace,
							  char *mirpath,
							  int16 pridbid, int16 mirdbid,
							  bool set_mirror_existence)
{
	PersistentFileSysObjName fsObjName;
	char *newpath;
	WRITE_PERSISTENT_STATE_ORDERED_LOCK_DECLARE;

	FilespaceDirEntry fde;
	ItemPointerData persistentTid;
	int64 persistentSerialNum;

	if (Persistent_BeforePersistenceWork())
		elog(ERROR, "persistent table changes forbidden");

	PersistentFilespace_VerifyInitScan();

	PersistentFileSysObjName_SetFilespaceDir(&fsObjName, filespace);

	WRITE_PERSISTENT_STATE_ORDERED_LOCK;

	LWLockAcquire(FilespaceHashLock, LW_SHARED);
	fde = PersistentFilespace_FindDirUnderLock(filespace);
	if (fde == NULL)
		elog(ERROR, "did not find persistent filespace entry %u",
			 filespace);

	if (fde->dbId1 == pridbid)
	{
		fde->dbId2 = mirdbid;
		PersistentFilespace_BlankPadCopyLocation(
										fde->locationBlankPadded2,
										mirpath);
		newpath = fde->locationBlankPadded2;
	}
	else if (fde->dbId2 == pridbid)
	{
		fde->dbId1 = mirdbid;
		PersistentFilespace_BlankPadCopyLocation(
										fde->locationBlankPadded1,
										mirpath);
		newpath = fde->locationBlankPadded1;
	}
	else
	{
		Insist(false);
	}

	ItemPointerCopy(&fde->persistentTid, &persistentTid);
	persistentSerialNum = fde->persistentSerialNum;
	LWLockRelease(FilespaceHashLock);

	PersistentFileSysObj_AddMirror(&fsObjName,
								   &persistentTid,
								   persistentSerialNum,
								   pridbid,
								   mirdbid,
								   (void *)newpath,
								   set_mirror_existence,
								   /* flushToXlog */ false);

	WRITE_PERSISTENT_STATE_ORDERED_UNLOCK;
}
Example 23
/*
 * Activate a standby master by removing reference to the dead master
 * and changing our dbid to the old master's dbid
 */
void
PersistentFilespace_ActivateStandby(int16 oldmaster, int16 newmaster)
{
	HASH_SEQ_STATUS hstat;
	FilespaceDirEntry fde;
	WRITE_PERSISTENT_STATE_ORDERED_LOCK_DECLARE;

	if (Persistent_BeforePersistenceWork())
		elog(ERROR, "persistent table changes forbidden");

	hash_seq_init(&hstat, persistentFilespaceSharedHashTable);

	PersistentFilespace_VerifyInitScan();

	WRITE_PERSISTENT_STATE_ORDERED_LOCK;

	/*
	 * We release FilespaceHashLock in the middle of the loop and re-acquire
	 * it after doing persistent table change.  This is needed to prevent
	 * holding the lock for any purpose other than to protect the filespace
	 * shared hash table.  Not releasing this lock could result in file I/O
	 * and potential deadlock due to other LW locks being acquired in the
	 * process.  Releasing the lock this way is safe because we are still
	 * holding PersistentObjLock in exclusive mode.  Any change to the
	 * filespace shared hash table is also protected by PersistentObjLock.
	 */
	WRITE_FILESPACE_HASH_LOCK;
	while ((fde = hash_seq_search(&hstat)) != NULL)
	{
		Oid filespace = fde->key.filespaceOid;
		PersistentFileSysObjName fsObjName;
		ItemPointerData persistentTid;
		int64 persistentSerialNum = fde->persistentSerialNum;

		ItemPointerCopy(&fde->persistentTid, &persistentTid);

		PersistentFileSysObjName_SetFilespaceDir(&fsObjName, filespace);

		if (fde->dbId1 == oldmaster)
		{
			fde->dbId1 = InvalidDbid;
			fde->dbId2 = newmaster;

			/* Copy standby filespace location into new master location */
			PersistentFilespace_BlankPadCopyLocation(
										fde->locationBlankPadded2,
										fde->locationBlankPadded1);

			PersistentFilespace_BlankPadCopyLocation(
										fde->locationBlankPadded1,
										"");
		}
		else if (fde->dbId2 == oldmaster)
		{
			fde->dbId2 = InvalidDbid;
			fde->dbId1 = newmaster;

			/* Copy standby filespace location into new master location */
			PersistentFilespace_BlankPadCopyLocation(
										fde->locationBlankPadded1,
										fde->locationBlankPadded2);

			PersistentFilespace_BlankPadCopyLocation(
										fde->locationBlankPadded2,
										"");
		}
		WRITE_FILESPACE_HASH_UNLOCK;

		PersistentFileSysObj_ActivateStandby(&fsObjName,
								   &persistentTid,
								   persistentSerialNum,
								   oldmaster,
								   newmaster,
								   /* flushToXlog */ false);
		WRITE_FILESPACE_HASH_LOCK;
	}
	WRITE_FILESPACE_HASH_UNLOCK;

	WRITE_PERSISTENT_STATE_ORDERED_UNLOCK;
}
Example 24
/*
 * Indicate we intend to create a filespace file as part of the current transaction.
 *
 * An XLOG IntentToCreate record is generated that will guard the subsequent file-system
 * create in case the transaction aborts.
 *
 * After 1 or more calls to this routine to mark intention about filespace files that are going
 * to be created, call ~_DoPendingCreates to do the actual file-system creates.  (See its
 * note on XLOG flushing).
 */
void PersistentFilespace_MarkCreatePending(
	Oid 		filespaceOid,
				/* The filespace OID for the create. */

	int16		primaryDbId,

	char 		*primaryFilespaceLocation,
				/*
				 * The primary filespace directory path.  NOT Blank padded.
				 * Just a NULL terminated string.
				 */

	int16		mirrorDbId,

	char 		*mirrorFilespaceLocation,

	MirroredObjectExistenceState mirrorExistenceState,

	ItemPointer		persistentTid,
				/* TID of the gp_persistent_rel_files tuple for the rel file */

	int64			*persistentSerialNum,


	bool			flushToXLog)
				/* When true, the XLOG record for this change will be flushed to disk. */

{
	WRITE_PERSISTENT_STATE_ORDERED_LOCK_DECLARE;

	PersistentFileSysObjName fsObjName;

	FilespaceDirEntry filespaceDirEntry;
	TransactionId topXid;
	Datum values[Natts_gp_persistent_filespace_node];
	char mirrorFilespaceLocationBlankPadded[FilespaceLocationBlankPaddedWithNullTermLen];
	char primaryFilespaceLocationBlankPadded[FilespaceLocationBlankPaddedWithNullTermLen];

	if (Persistent_BeforePersistenceWork())
	{
		if (Debug_persistent_print)
			elog(Persistent_DebugPrintLevel(),
				 "Skipping persistent filespace %u because we are before persistence work",
				 filespaceOid);

		return;	// The initdb process will load the persistent table once we are out of bootstrap mode.
	}

	PersistentFilespace_VerifyInitScan();

	PersistentFileSysObjName_SetFilespaceDir(&fsObjName,filespaceOid);

	topXid = GetTopTransactionId();

	WRITE_PERSISTENT_STATE_ORDERED_LOCK;

	PersistentFilespace_BlankPadCopyLocation(
										primaryFilespaceLocationBlankPadded,
										primaryFilespaceLocation);
	
	PersistentFilespace_BlankPadCopyLocation(
										mirrorFilespaceLocationBlankPadded,
										mirrorFilespaceLocation);

	GpPersistentFilespaceNode_SetDatumValues(
										values,
										filespaceOid,
										primaryDbId,
										primaryFilespaceLocationBlankPadded,
										mirrorDbId,
										mirrorFilespaceLocationBlankPadded,
										PersistentFileSysState_CreatePending,
										/* createMirrorDataLossTrackingSessionNum */ 0,
										mirrorExistenceState,
										/* reserved */ 0,
										/* parentXid */ topXid,
										/* persistentSerialNum */ 0);	// This will be set by PersistentFileSysObj_AddTuple.

	PersistentFileSysObj_AddTuple(
							PersistentFsObjType_FilespaceDir,
							values,
							flushToXLog,
							persistentTid,
							persistentSerialNum);


	WRITE_FILESPACE_HASH_LOCK;

	filespaceDirEntry =	PersistentFilespace_CreateDirUnderLock(filespaceOid);

	Assert(filespaceDirEntry != NULL);

	filespaceDirEntry->dbId1 = primaryDbId;
	memcpy(filespaceDirEntry->locationBlankPadded1, primaryFilespaceLocationBlankPadded,
		   FilespaceLocationBlankPaddedWithNullTermLen);
	
	filespaceDirEntry->dbId2 = mirrorDbId;
	memcpy(filespaceDirEntry->locationBlankPadded2, mirrorFilespaceLocationBlankPadded,
		   FilespaceLocationBlankPaddedWithNullTermLen);

	filespaceDirEntry->state = PersistentFileSysState_CreatePending;
	ItemPointerCopy(persistentTid, &filespaceDirEntry->persistentTid);
	filespaceDirEntry->persistentSerialNum = *persistentSerialNum;

	WRITE_FILESPACE_HASH_UNLOCK;

	/*
	 * This XLOG must be generated under the persistent write-lock.
	 */
#ifdef MASTER_MIRROR_SYNC
	mmxlog_log_create_filespace(filespaceOid);
#endif

	SIMPLE_FAULT_INJECTOR(FaultBeforePendingDeleteFilespaceEntry);

	/*
	 * MPP-18228
	 * To make adding 'Create Pending' entry to persistent table and adding
	 * to the PendingDelete list atomic
	 */
	PendingDelete_AddCreatePendingEntryWrapper(
								&fsObjName,
								persistentTid,
								*persistentSerialNum);

	WRITE_PERSISTENT_STATE_ORDERED_UNLOCK;

	if (Debug_persistent_print)
		elog(Persistent_DebugPrintLevel(),
		     "Persistent filespace directory: Add '%s' in state 'Created', mirror existence state '%s', serial number " INT64_FORMAT " at TID %s",
			 PersistentFileSysObjName_ObjectName(&fsObjName),
			 MirroredObjectExistenceState_Name(mirrorExistenceState),
			 *persistentSerialNum,
			 ItemPointerToString(persistentTid));
}
Example 25
File: nbtree.c Project: LJoNe/gpdb
/*
 * Post vacuum, iterate over all entries in index, check if the h_tid
 * of each entry exists and is not dead.  For specific system tables,
 * also ensure that the key in index entry matches the corresponding
 * attribute in the heap tuple.
 */
void
_bt_validate_vacuum(Relation irel, Relation hrel, TransactionId oldest_xmin)
{
	MIRROREDLOCK_BUFMGR_DECLARE;

	BlockNumber blkno;
	BlockNumber num_pages;
	Buffer ibuf = InvalidBuffer;
	Buffer hbuf = InvalidBuffer;
	Page ipage;
	BTPageOpaque opaque;
	IndexTuple itup;
	HeapTupleData htup;
	OffsetNumber maxoff,
			minoff,
			offnum;
	Oid ioid,
		hoid;
	bool isnull;

	blkno = BTREE_METAPAGE + 1;
	num_pages = RelationGetNumberOfBlocks(irel);

	elog(LOG, "btvalidatevacuum: index %s, heap %s",
		 RelationGetRelationName(irel), RelationGetRelationName(hrel));

	MIRROREDLOCK_BUFMGR_LOCK;
	for (; blkno < num_pages; blkno++)
	{
		ibuf = ReadBuffer(irel, blkno);
		ipage = BufferGetPage(ibuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(ipage);
		if (!PageIsNew(ipage))
			_bt_checkpage(irel, ibuf);
		if (P_ISLEAF(opaque))
		{
			minoff = P_FIRSTDATAKEY(opaque);
			maxoff = PageGetMaxOffsetNumber(ipage);
			for (offnum = minoff;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				itup = (IndexTuple) PageGetItem(ipage,
												PageGetItemId(ipage, offnum));
				ItemPointerCopy(&itup->t_tid, &htup.t_self);
				/*
				 * TODO: construct a tid bitmap based on index tids
				 * and fetch heap tids in order afterwards.  That will
				 * also allow validating if a heap tid appears twice
				 * in a unique index.
				 */
				if (!heap_release_fetch(hrel, SnapshotAny, &htup,
										&hbuf, true, NULL))
				{
					elog(ERROR, "btvalidatevacuum: tid (%d,%d) from index %s "
						 "not found in heap %s",
						 ItemPointerGetBlockNumber(&itup->t_tid),
						 ItemPointerGetOffsetNumber(&itup->t_tid),
						 RelationGetRelationName(irel),
						 RelationGetRelationName(hrel));
				}
				switch (HeapTupleSatisfiesVacuum(hrel, htup.t_data, oldest_xmin, hbuf))
				{
					case HEAPTUPLE_RECENTLY_DEAD:
					case HEAPTUPLE_LIVE:
					case HEAPTUPLE_INSERT_IN_PROGRESS:
					case HEAPTUPLE_DELETE_IN_PROGRESS:
						/* these tuples are considered alive by vacuum */
						break;
					case HEAPTUPLE_DEAD:
						elog(ERROR, "btvalidatevacuum: vacuum did not remove "
							 "dead tuple (%d,%d) from heap %s and index %s",
							 ItemPointerGetBlockNumber(&itup->t_tid),
							 ItemPointerGetOffsetNumber(&itup->t_tid),
							 RelationGetRelationName(hrel),
							 RelationGetRelationName(irel));
						break;
					default:
						elog(ERROR, "btvalidatevacuum: invalid visibility");
						break;
				}
				switch(RelationGetRelid(irel))
				{
					case DatabaseOidIndexId:
					case TypeOidIndexId:
					case ClassOidIndexId:
					case ConstraintOidIndexId:
						hoid = HeapTupleGetOid(&htup);
						ioid = index_getattr(itup, 1, RelationGetDescr(irel), &isnull);
						if (hoid != ioid)
						{
							elog(ERROR,
								 "btvalidatevacuum: index oid(%d) != heap oid(%d)"
								 " tuple (%d,%d) index %s", ioid, hoid,
								 ItemPointerGetBlockNumber(&itup->t_tid),
								 ItemPointerGetOffsetNumber(&itup->t_tid),
								 RelationGetRelationName(irel));
						}
						break;
					case GpRelationNodeOidIndexId:
						hoid = heap_getattr(&htup, 1, RelationGetDescr(hrel), &isnull);
						ioid = index_getattr(itup, 1, RelationGetDescr(irel), &isnull);
						if (hoid != ioid)
						{
							elog(ERROR,
								 "btvalidatevacuum: index oid(%d) != heap oid(%d)"
								 " tuple (%d,%d) index %s", ioid, hoid,
								 ItemPointerGetBlockNumber(&itup->t_tid),
								 ItemPointerGetOffsetNumber(&itup->t_tid),
								 RelationGetRelationName(irel));
						}
						int4 hsegno = heap_getattr(&htup, 2, RelationGetDescr(hrel), &isnull);
						int4 isegno = index_getattr(itup, 2, RelationGetDescr(irel), &isnull);
						if (isegno != hsegno)
						{
							elog(ERROR,
								 "btvalidatevacuum: index segno(%d) != heap segno(%d)"
								 " tuple (%d,%d) index %s", isegno, hsegno,
								 ItemPointerGetBlockNumber(&itup->t_tid),
								 ItemPointerGetOffsetNumber(&itup->t_tid),
								 RelationGetRelationName(irel));
						}
						break;
					default:
						break;
				}
				if (RelationGetNamespace(irel) == PG_AOSEGMENT_NAMESPACE)
				{
					int4 isegno = index_getattr(itup, 1, RelationGetDescr(irel), &isnull);
					int4 hsegno = heap_getattr(&htup, 1, RelationGetDescr(hrel), &isnull);
					if (isegno != hsegno)
					{
						elog(ERROR,
							 "btvalidatevacuum: index segno(%d) != heap segno(%d)"
							 " tuple (%d,%d) index %s", isegno, hsegno,
							 ItemPointerGetBlockNumber(&itup->t_tid),
							 ItemPointerGetOffsetNumber(&itup->t_tid),
							 RelationGetRelationName(irel));
					}
				}
			}
		}
		if (BufferIsValid(ibuf))
			ReleaseBuffer(ibuf);
	}
	if (BufferIsValid(hbuf))
		ReleaseBuffer(hbuf);
	MIRROREDLOCK_BUFMGR_UNLOCK;
}
Example 26
/*
 * Search the relation 'rel' for tuple using the sequential scan.
 *
 * If a matching tuple is found, lock it with lockmode, fill the slot with its
 * contents, and return true.  Return false otherwise.
 *
 * Note that this stops on the first matching tuple.
 *
 * This can obviously be quite slow on tables that have more than a few rows.
 */
bool
RelationFindReplTupleSeq(Relation rel, LockTupleMode lockmode,
						 TupleTableSlot *searchslot, TupleTableSlot *outslot)
{
	HeapTuple	scantuple;
	HeapScanDesc scan;
	SnapshotData snap;
	TransactionId xwait;
	bool		found;
	TupleDesc	desc = RelationGetDescr(rel);

	Assert(equalTupleDescs(desc, outslot->tts_tupleDescriptor));

	/* Start a heap scan. */
	InitDirtySnapshot(snap);
	scan = heap_beginscan(rel, &snap, 0, NULL);

retry:
	found = false;

	heap_rescan(scan, NULL);

	/* Try to find the tuple */
	while ((scantuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		if (!tuple_equals_slot(desc, scantuple, searchslot))
			continue;

		found = true;
		ExecStoreTuple(scantuple, outslot, InvalidBuffer, false);
		ExecMaterializeSlot(outslot);

		xwait = TransactionIdIsValid(snap.xmin) ?
			snap.xmin : snap.xmax;

		/*
		 * If the tuple is locked, wait for locking transaction to finish and
		 * retry.
		 */
		if (TransactionIdIsValid(xwait))
		{
			XactLockTableWait(xwait, NULL, NULL, XLTW_None);
			goto retry;
		}
	}

	/* Found tuple, try to lock it in the lockmode. */
	if (found)
	{
		Buffer		buf;
		HeapUpdateFailureData hufd;
		HTSU_Result res;
		HeapTupleData locktup;

		ItemPointerCopy(&outslot->tts_tuple->t_self, &locktup.t_self);

		PushActiveSnapshot(GetLatestSnapshot());

		res = heap_lock_tuple(rel, &locktup, GetCurrentCommandId(false),
							  lockmode,
							  LockWaitBlock,
							  false /* don't follow updates */ ,
							  &buf, &hufd);
		/* the tuple slot already has the buffer pinned */
		ReleaseBuffer(buf);

		PopActiveSnapshot();

		switch (res)
		{
			case HeapTupleMayBeUpdated:
				break;
			case HeapTupleUpdated:
				/* XXX: Improve handling here */
				ereport(LOG,
						(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
						 errmsg("concurrent update, retrying")));
				goto retry;
			case HeapTupleInvisible:
				elog(ERROR, "attempted to lock invisible tuple");
			default:
				elog(ERROR, "unexpected heap_lock_tuple status: %u", res);
				break;
		}
	}

	heap_endscan(scan);

	return found;
}
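Example 12 above shows the index-based variant of this lookup. Replication apply code that has both available might dispatch between them as in the sketch below (hypothetical glue, not from the source): prefer the replica-identity index when one exists, else fall back to the sequential scan.

/* Hypothetical dispatcher over the two lookup strategies. */
static bool
find_repl_tuple(Relation rel, Oid idxoid, LockTupleMode lockmode,
				TupleTableSlot *searchslot, TupleTableSlot *outslot)
{
	if (OidIsValid(idxoid))
		return RelationFindReplTupleByIndex(rel, idxoid, lockmode,
											searchslot, outslot);

	return RelationFindReplTupleSeq(rel, lockmode, searchslot, outslot);
}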