Example #1
static void PersistentStore_DoInsertTuple(
	PersistentStoreData 		*storeData,
	PersistentStoreSharedData 	*storeSharedData,
	Relation				persistentRel,
				/* The persistent table relation. */
	Datum					*values,
	bool					flushToXLog,
				/* When true, the XLOG record for this change will be flushed to disk. */
	ItemPointer 			persistentTid)
				/* TID of the stored tuple. */
{
	bool 		*nulls;
	HeapTuple	persistentTuple = NULL;
	XLogRecPtr	xlogInsertEndLoc;

	/*
	 * In order to keep the tuples the exact same size to enable direct reuse of
	 * free tuples, we do not use NULLs.
	 */
	nulls = (bool*)palloc0(storeData->numAttributes * sizeof(bool));
		
	/*
	 * Form the tuple.
	 */
	persistentTuple = heap_form_tuple(persistentRel->rd_att, values, nulls);
	if (!HeapTupleIsValid(persistentTuple))
		elog(ERROR, "Failed to build persistent tuple ('%s')",
		     storeData->tableName);

	frozen_heap_insert(
					persistentRel,
					persistentTuple);

	if (Debug_persistent_store_print)
		elog(PersistentStore_DebugPrintLevel(), 
			 "PersistentStore_DoInsertTuple: new insert TID %s ('%s')",
			 ItemPointerToString2(&persistentTuple->t_self),
			 storeData->tableName);
	
	/*
	 * Return the TID of the inserted tuple to the caller and remember the
	 * end location of the INSERT's XLOG record.
	 */
	*persistentTid = persistentTuple->t_self;
		
	xlogInsertEndLoc = XLogLastInsertEndLoc();

	heap_freetuple(persistentTuple);

	if (flushToXLog)
	{
		XLogFlush(xlogInsertEndLoc);
		XLogRecPtr_Zero(&nowaitXLogEndLoc);
	}
	else
		nowaitXLogEndLoc = xlogInsertEndLoc;

	pfree(nulls);

}
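The snippet above relies on module-level XLOG bookkeeping that is not shown. A minimal sketch of what it assumes follows; the names are taken from the snippet, but the definitions here are reconstructed for illustration rather than copied from the original source.

/*
 * Assumed module-level state: the end location of the most recent
 * unflushed persistent-store XLOG record, plus a helper to clear it.
 * Illustrative definitions only.
 */
static XLogRecPtr nowaitXLogEndLoc;

static void
XLogRecPtr_Zero(XLogRecPtr *xlogLoc)
{
	MemSet(xlogLoc, 0, sizeof(XLogRecPtr));
}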
Example #2
/*
 * Execute the CREATE BARRIER command. Write a BARRIER WAL record and flush the
 * WAL buffers to disk before returning to the caller. Writing the WAL record
 * does not guarantee successful completion of the barrier command.
 */
void
ProcessCreateBarrierExecute(const char *id)
{
	StringInfoData buf;

	if (!IsConnFromCoord())
		ereport(ERROR,
				(errcode(ERRCODE_INTERNAL_ERROR),
				 errmsg("The CREATE BARRIER EXECUTE message is expected to "
						"arrive from a Coordinator")));
	{
		XLogRecData rdata[1];
		XLogRecPtr recptr;

		rdata[0].data = (char *) id;
		rdata[0].len = strlen(id) + 1;
		rdata[0].buffer = InvalidBuffer;
		rdata[0].next = NULL;

		recptr = XLogInsert(RM_BARRIER_ID, XLOG_BARRIER_CREATE, rdata);
		XLogFlush(recptr);
	}

	pq_beginmessage(&buf, 'b');
	pq_sendstring(&buf, id);
	pq_endmessage(&buf);
	pq_flush();
}
Example #3
static void
finish_sync_worker(void)
{
	/*
	 * Commit any outstanding transaction. This is the usual case, unless
	 * there was nothing to do for the table.
	 */
	if (IsTransactionState())
	{
		CommitTransactionCommand();
		pgstat_report_stat(false);
	}

	/* And flush all writes. */
	XLogFlush(GetXLogWriteRecPtr());

	StartTransactionCommand();
	ereport(LOG,
			(errmsg("logical replication table synchronization worker for subscription \"%s\", table \"%s\" has finished",
					MySubscription->name,
					get_rel_name(MyLogicalRepWorker->relid))));
	CommitTransactionCommand();

	/* Find the main apply worker and signal it. */
	logicalrep_worker_wakeup(MyLogicalRepWorker->subid, InvalidOid);

	/* Stop gracefully */
	proc_exit(0);
}
void PersistentStore_FlushXLog(void)
{
	if (nowaitXLogEndLoc.xlogid != 0 ||
		nowaitXLogEndLoc.xrecoff != 0)
	{
		XLogFlush(nowaitXLogEndLoc);
		XLogRecPtr_Zero(&nowaitXLogEndLoc);
	}
}
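Taken together with the other persistent-store snippets, the deferred-flush pattern looks roughly like the caller-side sketch below. This is a hypothetical fragment (the loop variables and arrays are invented for illustration): each change passes flushToXLog = false, and one PersistentStore_FlushXLog() call at the end flushes whatever end location was remembered in nowaitXLogEndLoc.

	/* Illustrative only: batch several changes, then flush WAL once. */
	for (i = 0; i < numTuples; i++)
		PersistentStore_UpdateTuple(storeData,
									storeSharedData,
									&tids[i],
									values[i],
									/* flushToXLog */ false);

	PersistentStore_FlushXLog();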
Example #5
/*
 * Write a TRUNCATE xlog record
 *
 * We must flush the xlog record to disk before returning --- see notes
 * in DistributedLog_Truncate().
 *
 * Note: xlog record is marked as outside transaction control, since we
 * want it to be redone whether the invoking transaction commits or not.
 */
static void
DistributedLog_WriteTruncateXlogRec(int page)
{
	XLogRecData rdata;
	XLogRecPtr	recptr;

	rdata.data = (char *) (&page);
	rdata.len = sizeof(int);
	rdata.buffer = InvalidBuffer;
	rdata.next = NULL;
	recptr = XLogInsert(RM_DISTRIBUTEDLOG_ID, DISTRIBUTEDLOG_TRUNCATE | XLOG_NO_TRAN, &rdata);
	XLogFlush(recptr);
}
Example #6
void PersistentStore_FreeTuple(
	PersistentStoreData 		*storeData,
	PersistentStoreSharedData 	*storeSharedData,
	ItemPointer 			persistentTid,
				/* TID of the stored tuple. */
	Datum					*freeValues,
	bool					flushToXLog)
				/* When true, the XLOG record for this change will be flushed to disk. */
{
	Relation	persistentRel;
	XLogRecPtr xlogEndLoc;
				/* The end location of the DELETE XLOG record. */

	Assert( LWLockHeldByMe(PersistentObjLock) );
				
#ifdef USE_ASSERT_CHECKING
	if (storeSharedData == NULL ||
		!PersistentStoreSharedData_EyecatcherIsValid(storeSharedData))
		elog(ERROR, "Persistent store shared-memory not valid");
#endif
				
	if (Debug_persistent_store_print)
		elog(PersistentStore_DebugPrintLevel(), 
			 "PersistentStore_FreeTuple: Going to free tuple at TID %s ('%s', shared data %p)",
			 ItemPointerToString(persistentTid),
			 storeData->tableName,
			 storeSharedData);
	
	Assert(ItemPointerIsValid(persistentTid));

	persistentRel = (*storeData->openRel)();
	simple_heap_delete_xid(persistentRel, persistentTid, FrozenTransactionId);
	/*
	 * Remember the end location of the DELETE's XLOG record.
	 */
	xlogEndLoc = XLogLastInsertEndLoc();

	(*storeData->closeRel)(persistentRel);

	storeSharedData->inUseCount--;

	if (flushToXLog)
	{
		XLogFlush(xlogEndLoc);
		XLogRecPtr_Zero(&nowaitXLogEndLoc);
	}
	else
		nowaitXLogEndLoc = xlogEndLoc;
}
Example #7
/*
 * Write out a new shared or local map file with the given contents.
 *
 * The magic number and CRC are automatically updated in *newmap.  On
 * success, we copy the data to the appropriate permanent static variable.
 *
 * If write_wal is TRUE then an appropriate WAL message is emitted.
 * (It will be false for bootstrap and WAL replay cases.)
 *
 * If send_sinval is TRUE then a SI invalidation message is sent.
 * (This should be true except in bootstrap case.)
 *
 * If preserve_files is TRUE then the storage manager is warned not to
 * delete the files listed in the map.
 *
 * Because this may be called during WAL replay when MyDatabaseId,
 * DatabasePath, etc aren't valid, we require the caller to pass in suitable
 * values.	The caller is also responsible for being sure no concurrent
 * map update could be happening.
 */
static void
write_relmap_file(bool shared, RelMapFile *newmap,
				  bool write_wal, bool send_sinval, bool preserve_files,
				  Oid dbid, Oid tsid, const char *dbpath)
{
	int			fd;
	RelMapFile *realmap;
	char		mapfilename[MAXPGPATH];

	/*
	 * Fill in the overhead fields and update CRC.
	 */
	newmap->magic = RELMAPPER_FILEMAGIC;
	if (newmap->num_mappings < 0 || newmap->num_mappings > MAX_MAPPINGS)
		elog(ERROR, "attempt to write bogus relation mapping");

	INIT_CRC32(newmap->crc);
	COMP_CRC32(newmap->crc, (char *) newmap, offsetof(RelMapFile, crc));
	FIN_CRC32(newmap->crc);

	/*
	 * Open the target file.  We prefer to do this before entering the
	 * critical section, so that an open() failure need not force PANIC.
	 */
	if (shared)
	{
		snprintf(mapfilename, sizeof(mapfilename), "global/%s",
				 RELMAPPER_FILENAME);
		realmap = &shared_map;
	}
	else
	{
		snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
				 dbpath, RELMAPPER_FILENAME);
		realmap = &local_map;
	}

	fd = OpenTransientFile(mapfilename,
						   O_WRONLY | O_CREAT | PG_BINARY,
						   S_IRUSR | S_IWUSR);
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open relation mapping file \"%s\": %m",
						mapfilename)));

	if (write_wal)
	{
		xl_relmap_update xlrec;
		XLogRecData rdata[2];
		XLogRecPtr	lsn;

		/* now errors are fatal ... */
		START_CRIT_SECTION();

		xlrec.dbid = dbid;
		xlrec.tsid = tsid;
		xlrec.nbytes = sizeof(RelMapFile);

		rdata[0].data = (char *) (&xlrec);
		rdata[0].len = MinSizeOfRelmapUpdate;
		rdata[0].buffer = InvalidBuffer;
		rdata[0].next = &(rdata[1]);
		rdata[1].data = (char *) newmap;
		rdata[1].len = sizeof(RelMapFile);
		rdata[1].buffer = InvalidBuffer;
		rdata[1].next = NULL;

		lsn = XLogInsert(RM_RELMAP_ID, XLOG_RELMAP_UPDATE, rdata);

		/* As always, WAL must hit the disk before the data update does */
		XLogFlush(lsn);
	}

	errno = 0;
	if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile))
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write to relation mapping file \"%s\": %m",
						mapfilename)));
	}

	/*
	 * We choose to fsync the data to disk before considering the task done.
	 * It would be possible to relax this if it turns out to be a performance
	 * issue, but it would complicate checkpointing --- see notes for
	 * CheckPointRelationMap.
	 */
	if (pg_fsync(fd) != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not fsync relation mapping file \"%s\": %m",
						mapfilename)));

	if (CloseTransientFile(fd))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close relation mapping file \"%s\": %m",
						mapfilename)));

	/*
	 * Now that the file is safely on disk, send sinval message to let other
	 * backends know to re-read it.  We must do this inside the critical
	 * section: if for some reason we fail to send the message, we have to
	 * force a database-wide PANIC.  Otherwise other backends might continue
	 * execution with stale mapping information, which would be catastrophic
	 * as soon as others began to use the now-committed data.
	 */
	if (send_sinval)
		CacheInvalidateRelmap(dbid);

	/*
	 * Make sure that the files listed in the map are not deleted if the outer
	 * transaction aborts.	This had better be within the critical section
	 * too: it's not likely to fail, but if it did, we'd arrive at transaction
	 * abort with the files still vulnerable.  PANICing will leave things in a
	 * good state on-disk.
	 *
	 * Note: we're cheating a little bit here by assuming that mapped files
	 * are either in pg_global or the database's default tablespace.
	 */
	if (preserve_files)
	{
		int32		i;

		for (i = 0; i < newmap->num_mappings; i++)
		{
			RelFileNode rnode;

			rnode.spcNode = tsid;
			rnode.dbNode = dbid;
			rnode.relNode = newmap->mappings[i].mapfilenode;
			RelationPreserveStorage(rnode, false);
		}
	}

	/* Success, update permanent copy */
	memcpy(realmap, newmap, sizeof(RelMapFile));

	/* Critical section done */
	if (write_wal)
		END_CRIT_SECTION();
}
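As a rough illustration of the flag combinations described in the header comment (these calls are reconstructed for clarity, not copied from the surrounding source): a normal in-transaction update of the shared map writes WAL, sends sinval, and preserves files, while WAL replay supplies explicit dbid/tsid/dbpath and suppresses WAL writing.

	/* Illustrative only: normal update of the shared map. */
	write_relmap_file(true, &newmap, true, true, true,
					  InvalidOid, GLOBALTABLESPACE_OID, "global");

	/* Illustrative only: during WAL replay, with values taken from the record. */
	write_relmap_file(false, &newmap, false, true, false,
					  xlrec->dbid, xlrec->tsid, dbpath);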
Example #8
/*
 * Reserve WAL for the currently active slot.
 *
 * Compute and set restart_lsn in a manner that's appropriate for the type of
 * the slot and concurrency safe.
 */
void
ReplicationSlotReserveWal(void)
{
	ReplicationSlot *slot = MyReplicationSlot;

	Assert(slot != NULL);
	Assert(slot->data.restart_lsn == InvalidXLogRecPtr);

	/*
	 * The replication slot mechanism is used to prevent removal of required
	 * WAL. As there is no interlock between this routine and checkpoints, WAL
	 * segments could concurrently be removed when a now stale return value of
	 * ReplicationSlotsComputeRequiredLSN() is used. In the unlikely case that
	 * this happens we'll just retry.
	 */
	while (true)
	{
		XLogSegNo	segno;

		/*
		 * For logical slots log a standby snapshot and start logical decoding
		 * at exactly that position. That allows the slot to start up more
		 * quickly.
		 *
		 * That's not needed (or indeed helpful) for physical slots as they'll
		 * start replay at the last logged checkpoint anyway. Instead return
		 * the location of the last redo LSN. While that slightly increases
		 * the chance that we have to retry, it's where a base backup has to
		 * start replay at.
		 */
		if (!RecoveryInProgress() && SlotIsLogical(slot))
		{
			XLogRecPtr	flushptr;

			/* start at current insert position */
			slot->data.restart_lsn = GetXLogInsertRecPtr();

			/* make sure we have enough information to start */
			flushptr = LogStandbySnapshot();

			/* and make sure it's fsynced to disk */
			XLogFlush(flushptr);
		}
		else
		{
			slot->data.restart_lsn = GetRedoRecPtr();
		}

		/* prevent WAL removal as fast as possible */
		ReplicationSlotsComputeRequiredLSN();

		/*
		 * If all required WAL is still there, great, otherwise retry. The
		 * slot should prevent further removal of WAL, unless there's a
		 * concurrent ReplicationSlotsComputeRequiredLSN() after we've written
		 * the new restart_lsn above, so normally we should never need to loop
		 * more than twice.
		 */
		XLByteToSeg(slot->data.restart_lsn, segno, wal_segment_size);
		if (XLogGetLastRemovedSegno() < segno)
			break;
	}
}
/*
 * RelationTruncate
 *		Physically truncate a relation to the specified number of blocks.
 *
 * This includes getting rid of any buffers for the blocks that are to be
 * dropped.
 */
void
RelationTruncate(Relation rel, BlockNumber nblocks)
{
	bool		fsm;
	bool		vm;

	/* Open it at the smgr level if not already done */
	RelationOpenSmgr(rel);

	/*
	 * Make sure smgr_targblock etc aren't pointing somewhere past new end
	 */
	rel->rd_smgr->smgr_targblock = InvalidBlockNumber;
	rel->rd_smgr->smgr_fsm_nblocks = InvalidBlockNumber;
	rel->rd_smgr->smgr_vm_nblocks = InvalidBlockNumber;

	/* Truncate the FSM first if it exists */
	fsm = smgrexists(rel->rd_smgr, FSM_FORKNUM);
	if (fsm)
		FreeSpaceMapTruncateRel(rel, nblocks);

	/* Truncate the visibility map too if it exists. */
	vm = smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
	if (vm)
		visibilitymap_truncate(rel, nblocks);

	/*
	 * We WAL-log the truncation before actually truncating, which means
	 * trouble if the truncation fails. If we then crash, the WAL replay
	 * likely isn't going to succeed in the truncation either, and cause a
	 * PANIC. It's tempting to put a critical section here, but that cure
	 * would be worse than the disease. It would turn a usually harmless
	 * failure to truncate, that might spell trouble at WAL replay, into a
	 * certain PANIC.
	 */
	if (!rel->rd_istemp)
	{
		/*
		 * Make an XLOG entry reporting the file truncation.
		 */
		XLogRecPtr	lsn;
		XLogRecData rdata;
		xl_smgr_truncate xlrec;

		xlrec.blkno = nblocks;
		xlrec.rnode = rel->rd_node;

		rdata.data = (char *) &xlrec;
		rdata.len = sizeof(xlrec);
		rdata.buffer = InvalidBuffer;
		rdata.next = NULL;

		lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE, &rdata);

		/*
		 * Flush, because otherwise the truncation of the main relation might
		 * hit the disk before the WAL record, and the truncation of the FSM
		 * or visibility map. If we crashed during that window, we'd be left
		 * with a truncated heap, but the FSM or visibility map would still
		 * contain entries for the non-existent heap pages.
		 */
		if (fsm || vm)
			XLogFlush(lsn);
	}

	/* Do the real work */
	smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks, rel->rd_istemp);
}
void PersistentStore_ReplaceTuple(
	PersistentStoreData 		*storeData,

	PersistentStoreSharedData 	*storeSharedData,

	ItemPointer 			persistentTid,
				/* TID of the stored tuple. */

	HeapTuple				tuple,

	Datum					*newValues,
	
	bool					*replaces,

	bool					flushToXLog)
				/* When true, the XLOG record for this change will be flushed to disk. */

{
	Relation	persistentRel;
	bool 		*nulls;
	HeapTuple	replacementTuple = NULL;
	XLogRecPtr 	xlogUpdateEndLoc;
	
#ifdef USE_ASSERT_CHECKING
	if (storeSharedData == NULL ||
		!PersistentStoreSharedData_EyecatcherIsValid(storeSharedData))
		elog(ERROR, "Persistent store shared-memory not valid");
#endif
	
	if (Debug_persistent_store_print)
		elog(PersistentStore_DebugPrintLevel(), 
			 "PersistentStore_ReplaceTuple: Going to replace set of columns in tuple at TID %s ('%s', shared data %p)",
			 ItemPointerToString(persistentTid),
			 storeData->tableName,
			 storeSharedData);

	persistentRel = (*storeData->openRel)();

	/*
	 * In order to keep the tuples the exact same size to enable direct reuse of
	 * free tuples, we do not use NULLs.
	 */
	nulls = (bool*)palloc0(storeData->numAttributes * sizeof(bool));
		
	/*
	 * Modify the tuple.
	 */
	replacementTuple = heap_modify_tuple(tuple, persistentRel->rd_att, 
										 newValues, nulls, replaces);

	replacementTuple->t_self = *persistentTid;
		
	frozen_heap_inplace_update(persistentRel, replacementTuple);

	/*
	 * Remember the end location of the UPDATE's XLOG record.
	 */
	xlogUpdateEndLoc = XLogLastInsertEndLoc();

	heap_freetuple(replacementTuple);
	pfree(nulls);

	if (Debug_persistent_store_print)
	{
		Datum 			*readValues;
		bool			*readNulls;
		HeapTupleData 	readTuple;
		Buffer			buffer;
		HeapTuple		readTupleCopy;
		
		elog(PersistentStore_DebugPrintLevel(), 
			 "PersistentStore_ReplaceTuple: Replaced set of columns in tuple at TID %s ('%s')",
			 ItemPointerToString(persistentTid),
			 storeData->tableName);
		
		readValues = (Datum*)palloc(storeData->numAttributes * sizeof(Datum));
		readNulls = (bool*)palloc(storeData->numAttributes * sizeof(bool));

		readTuple.t_self = *persistentTid;
		
		if (!heap_fetch(persistentRel, SnapshotAny,
						&readTuple, &buffer, false, NULL))
		{
			elog(ERROR, "Failed to fetch persistent tuple at %s ('%s')",
				 ItemPointerToString(&readTuple.t_self),
				 storeData->tableName);
		}
		
		
		readTupleCopy = heaptuple_copy_to(&readTuple, NULL, NULL);
		
		ReleaseBuffer(buffer);
		
		heap_deform_tuple(readTupleCopy, persistentRel->rd_att, readValues, readNulls);
		
		(*storeData->printTupleCallback)(
									PersistentStore_DebugPrintLevel(),
									"STORE REPLACED TUPLE",
									persistentTid,
									readValues);

		heap_freetuple(readTupleCopy);
		pfree(readValues);
		pfree(readNulls);
	}

	(*storeData->closeRel)(persistentRel);
	
	if (flushToXLog)
	{
		XLogFlush(xlogUpdateEndLoc);
		XLogRecPtr_Zero(&nowaitXLogEndLoc);
	}
	else
		nowaitXLogEndLoc = xlogUpdateEndLoc;
}
Example #11
/*
 *	RecordTransactionAbortPrepared
 *
 * This is basically the same as RecordTransactionAbort.
 *
 * We know the transaction made at least one XLOG entry (its PREPARE),
 * so it is never possible to optimize out the abort record.
 */
static void
RecordTransactionAbortPrepared(TransactionId xid,
							   int nchildren,
							   TransactionId *children,
							   int nrels,
							   RelFileNode *rels)
{
	XLogRecData rdata[3];
	int			lastrdata = 0;
	xl_xact_abort_prepared xlrec;
	XLogRecPtr	recptr;

	/*
	 * Catch the scenario where we aborted partway through
	 * RecordTransactionCommitPrepared ...
	 */
	if (TransactionIdDidCommit(xid))
		elog(PANIC, "cannot abort transaction %u, it was already committed",
			 xid);

	START_CRIT_SECTION();

	/* Emit the XLOG abort record */
	xlrec.xid = xid;
	xlrec.arec.xact_time = GetCurrentTimestamp();
	xlrec.arec.nrels = nrels;
	xlrec.arec.nsubxacts = nchildren;
	rdata[0].data = (char *) (&xlrec);
	rdata[0].len = MinSizeOfXactAbortPrepared;
	rdata[0].buffer = InvalidBuffer;
	/* dump rels to delete */
	if (nrels > 0)
	{
		rdata[0].next = &(rdata[1]);
		rdata[1].data = (char *) rels;
		rdata[1].len = nrels * sizeof(RelFileNode);
		rdata[1].buffer = InvalidBuffer;
		lastrdata = 1;
	}
	/* dump committed child Xids */
	if (nchildren > 0)
	{
		rdata[lastrdata].next = &(rdata[2]);
		rdata[2].data = (char *) children;
		rdata[2].len = nchildren * sizeof(TransactionId);
		rdata[2].buffer = InvalidBuffer;
		lastrdata = 2;
	}
	rdata[lastrdata].next = NULL;

	recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_ABORT_PREPARED, rdata);

	/* Always flush, since we're about to remove the 2PC state file */
	XLogFlush(recptr);

	/*
	 * Mark the transaction aborted in clog.  This is not absolutely necessary
	 * but we may as well do it while we are here.
	 */
	TransactionIdAbortTree(xid, nchildren, children);

	END_CRIT_SECTION();
}
void PersistentStore_FreeTuple(
	PersistentStoreData 		*storeData,

	PersistentStoreSharedData 	*storeSharedData,

	ItemPointer 			persistentTid,
				/* TID of the stored tuple. */

	Datum					*freeValues,

	bool					flushToXLog)
				/* When true, the XLOG record for this change will be flushed to disk. */

{
	Relation	persistentRel;
	HeapTuple	persistentTuple = NULL;
	ItemPointerData prevFreeTid;
	XLogRecPtr xlogEndLoc;
				/* The end location of the UPDATE XLOG record. */
				
#ifdef USE_ASSERT_CHECKING
	if (storeSharedData == NULL ||
		!PersistentStoreSharedData_EyecatcherIsValid(storeSharedData))
		elog(ERROR, "Persistent store shared-memory not valid");
#endif
				
	if (Debug_persistent_store_print)
		elog(PersistentStore_DebugPrintLevel(), 
			 "PersistentStore_FreeTuple: Going to free tuple at TID %s ('%s', shared data %p)",
			 ItemPointerToString(persistentTid),
			 storeData->tableName,
			 storeSharedData);
	
	Assert(persistentTid->ip_posid != 0);

	persistentRel = (*storeData->openRel)();

	storeSharedData->maxFreeOrderNum++;
	if (storeSharedData->maxFreeOrderNum == 1)
		prevFreeTid = *persistentTid;		// So non-zero PreviousFreeTid indicates free.
	else
		prevFreeTid = storeSharedData->freeTid;
	storeSharedData->freeTid = *persistentTid;

	PersistentStore_FormTupleSetOurs(
							storeData,
							persistentRel->rd_att,
							freeValues,
							storeSharedData->maxFreeOrderNum,
							&prevFreeTid,
							&persistentTuple);

	persistentTuple->t_self = *persistentTid;
		
	frozen_heap_inplace_update(persistentRel, persistentTuple);

	/*
	 * Remember the end location of the UPDATE's XLOG record.
	 */
	xlogEndLoc = XLogLastInsertEndLoc();

	heap_freetuple(persistentTuple);

	(*storeData->closeRel)(persistentRel);

	storeSharedData->inUseCount--;

	if (Debug_persistent_store_print)
		elog(PersistentStore_DebugPrintLevel(), 
			 "PersistentStore_FreeTuple: Freed tuple at TID %s.  Maximum free order number " INT64_FORMAT ", in use count " INT64_FORMAT " ('%s')",
			 ItemPointerToString(&storeSharedData->freeTid),
			 storeSharedData->maxFreeOrderNum, 
			 storeSharedData->inUseCount,
			 storeData->tableName);

	if (flushToXLog)
	{
		XLogFlush(xlogEndLoc);
		XLogRecPtr_Zero(&nowaitXLogEndLoc);
	}
	else
		nowaitXLogEndLoc = xlogEndLoc;
}
Example #13
/*
 *	RecordTransactionCommitPrepared
 *
 * This is basically the same as RecordTransactionCommit: in particular,
 * we must take the CheckpointStartLock to avoid a race condition.
 *
 * We know the transaction made at least one XLOG entry (its PREPARE),
 * so it is never possible to optimize out the commit record.
 */
static void
RecordTransactionCommitPrepared(TransactionId xid,
								int nchildren,
								TransactionId *children,
								int nrels,
								RelFileNode *rels)
{
	XLogRecData rdata[3];
	int			lastrdata = 0;
	xl_xact_commit_prepared xlrec;
	XLogRecPtr	recptr;

	START_CRIT_SECTION();

	/* See notes in RecordTransactionCommit */
	LWLockAcquire(CheckpointStartLock, LW_SHARED);

	/* Emit the XLOG commit record */
	xlrec.xid = xid;
	xlrec.crec.xtime = time(NULL);
	xlrec.crec.nrels = nrels;
	xlrec.crec.nsubxacts = nchildren;
	rdata[0].data = (char *) (&xlrec);
	rdata[0].len = MinSizeOfXactCommitPrepared;
	rdata[0].buffer = InvalidBuffer;
	/* dump rels to delete */
	if (nrels > 0)
	{
		rdata[0].next = &(rdata[1]);
		rdata[1].data = (char *) rels;
		rdata[1].len = nrels * sizeof(RelFileNode);
		rdata[1].buffer = InvalidBuffer;
		lastrdata = 1;
	}
	/* dump committed child Xids */
	if (nchildren > 0)
	{
		rdata[lastrdata].next = &(rdata[2]);
		rdata[2].data = (char *) children;
		rdata[2].len = nchildren * sizeof(TransactionId);
		rdata[2].buffer = InvalidBuffer;
		lastrdata = 2;
	}
	rdata[lastrdata].next = NULL;

	recptr = XLogInsert(RM_XACT_ID,
						XLOG_XACT_COMMIT_PREPARED | XLOG_NO_TRAN,
						rdata);

	/* we don't currently try to sleep before flush here ... */

	/* Flush XLOG to disk */
	XLogFlush(recptr);

	/* Mark the transaction committed in pg_clog */
	TransactionIdCommit(xid);
	/* to avoid race conditions, the parent must commit first */
	TransactionIdCommitTree(nchildren, children);

	/* Checkpoint is allowed again */
	LWLockRelease(CheckpointStartLock);

	END_CRIT_SECTION();
}
Example #14
void
smgr_redo(XLogRecPtr lsn, XLogRecord *record)
{
	uint8		info = record->xl_info & ~XLR_INFO_MASK;

	/* Backup blocks are not used in smgr records */
	Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));

	if (info == XLOG_SMGR_CREATE)
	{
		xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
		SMgrRelation reln;

		reln = smgropen(xlrec->rnode, InvalidBackendId);
		smgrcreate(reln, xlrec->forkNum, true);
	}
	else if (info == XLOG_SMGR_TRUNCATE)
	{
		xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
		SMgrRelation reln;
		Relation	rel;

		reln = smgropen(xlrec->rnode, InvalidBackendId);

		/*
		 * Forcibly create relation if it doesn't exist (which suggests that
		 * it was dropped somewhere later in the WAL sequence).  As in
		 * XLogReadBuffer, we prefer to recreate the rel and replay the log as
		 * best we can until the drop is seen.
		 */
		smgrcreate(reln, MAIN_FORKNUM, true);

		/*
		 * Before we perform the truncation, update minimum recovery point
		 * to cover this WAL record. Once the relation is truncated, there's
		 * no going back. The buffer manager enforces the WAL-first rule
		 * for normal updates to relation files, so that the minimum recovery
		 * point is always updated before the corresponding change in the
		 * data file is flushed to disk. We have to do the same manually
		 * here.
		 *
		 * Doing this before the truncation means that if the truncation fails
		 * for some reason, you cannot start up the system even after restart,
		 * until you fix the underlying situation so that the truncation will
		 * succeed. Alternatively, we could update the minimum recovery point
		 * after truncation, but that would leave a small window where the
		 * WAL-first rule could be violated.
		 */
		XLogFlush(lsn);

		smgrtruncate(reln, MAIN_FORKNUM, xlrec->blkno);

		/* Also tell xlogutils.c about it */
		XLogTruncateRelation(xlrec->rnode, MAIN_FORKNUM, xlrec->blkno);

		/* Truncate FSM and VM too */
		rel = CreateFakeRelcacheEntry(xlrec->rnode);

		if (smgrexists(reln, FSM_FORKNUM))
			FreeSpaceMapTruncateRel(rel, xlrec->blkno);
		if (smgrexists(reln, VISIBILITYMAP_FORKNUM))
			visibilitymap_truncate(rel, xlrec->blkno);

		FreeFakeRelcacheEntry(rel);
	}
	else
		elog(PANIC, "smgr_redo: unknown op code %u", info);
}
Example #15
/*
 *	RecordTransactionCommit
 */
void
RecordTransactionCommit(void)
{
	/*
	 * If we made neither any XLOG entries nor any temp-rel updates, we
	 * can omit recording the transaction commit at all.
	 */
	if (MyXactMadeXLogEntry || MyXactMadeTempRelUpdate)
	{
		TransactionId xid = GetCurrentTransactionId();
		bool		madeTCentries;
		XLogRecPtr	recptr;

		/* Tell bufmgr and smgr to prepare for commit */
		BufmgrCommit();

		START_CRIT_SECTION();

		/*
		 * If our transaction made any transaction-controlled XLOG entries,
		 * we need to lock out checkpoint start between writing our XLOG
		 * record and updating pg_clog.  Otherwise it is possible for the
		 * checkpoint to set REDO after the XLOG record but fail to flush the
		 * pg_clog update to disk, leading to loss of the transaction commit
		 * if we crash a little later.  Slightly klugy fix for problem
		 * discovered 2004-08-10.
		 *
		 * (If it made no transaction-controlled XLOG entries, its XID
		 * appears nowhere in permanent storage, so no one else will ever care
		 * if it committed; so it doesn't matter if we lose the commit flag.)
		 *
		 * Note we only need a shared lock.
		 */
		madeTCentries = (MyLastRecPtr.xrecoff != 0);
		if (madeTCentries)
			LWLockAcquire(CheckpointStartLock, LW_SHARED);

		/*
		 * We only need to log the commit in XLOG if the transaction made
		 * any transaction-controlled XLOG entries.
		 */
		if (madeTCentries)
		{
			/* Need to emit a commit record */
			XLogRecData rdata;
			xl_xact_commit xlrec;

			xlrec.xtime = time(NULL);
			rdata.buffer = InvalidBuffer;
			rdata.data = (char *) (&xlrec);
			rdata.len = SizeOfXactCommit;
			rdata.next = NULL;

			/*
			 * XXX SHOULD SAVE ARRAY OF RELFILENODE-s TO DROP
			 */
			recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT, &rdata);
		}
		else
		{
			/* Just flush through last record written by me */
			recptr = ProcLastRecEnd;
		}

		/*
		 * We must flush our XLOG entries to disk if we made any XLOG
		 * entries, whether in or out of transaction control.  For
		 * example, if we reported a nextval() result to the client, this
		 * ensures that any XLOG record generated by nextval will hit the
		 * disk before we report the transaction committed.
		 */
		if (MyXactMadeXLogEntry)
		{
			/*
			 * Sleep before flush! This lets us flush more than one commit
			 * record per fsync.  (The idea is that some other backend may
			 * do the XLogFlush while we're sleeping.  This still needs
			 * work, because on most Unixen the minimum select() delay is
			 * 10msec or more, which is far too long.)
			 *
			 * We do not sleep if enableFsync is not turned on, nor if there
			 * are fewer than CommitSiblings other backends with active
			 * transactions.
			 */
			if (CommitDelay > 0 && enableFsync &&
				CountActiveBackends() >= CommitSiblings)
			{
				struct timeval delay;

				delay.tv_sec = 0;
				delay.tv_usec = CommitDelay;
				(void) select(0, NULL, NULL, NULL, &delay);
			}

			XLogFlush(recptr);
		}

		/*
		 * We must mark the transaction committed in clog if its XID
		 * appears either in permanent rels or in local temporary rels. We
		 * test this by seeing if we made transaction-controlled entries
		 * *OR* local-rel tuple updates.  Note that if we made only the
		 * latter, we have not emitted an XLOG record for our commit, and
		 * so in the event of a crash the clog update might be lost.  This
		 * is okay because no one else will ever care whether we
		 * committed.
		 */
		if (MyLastRecPtr.xrecoff != 0 || MyXactMadeTempRelUpdate)
			TransactionIdCommit(xid);

		/* Unlock checkpoint lock if we acquired it */
		if (madeTCentries)
			LWLockRelease(CheckpointStartLock);

		END_CRIT_SECTION();
	}

	/* Break the chain of back-links in the XLOG records I output */
	MyLastRecPtr.xrecoff = 0;
	MyXactMadeXLogEntry = false;
	MyXactMadeTempRelUpdate = false;

	/* Show myself as out of the transaction in PGPROC array */
	MyProc->logRec.xrecoff = 0;
}
Example #16
/*
 * Physical write of a page from a buffer slot
 *
 * On failure, we cannot just ereport(ERROR) since caller has put state in
 * shared memory that must be undone.  So, we return FALSE and save enough
 * info in static variables to let SlruReportIOError make the report.
 *
 * For now, assume it's not worth keeping a file pointer open across
 * independent read/write operations.  We do batch operations during
 * SimpleLruFlush, though.
 *
 * fdata is NULL for a standalone write, pointer to open-file info during
 * SimpleLruFlush.
 */
static bool
SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
{
	SlruShared	shared = ctl->shared;
	int			segno = pageno / SLRU_PAGES_PER_SEGMENT;
	int			rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
	int			offset = rpageno * BLCKSZ;
	char		path[MAXPGPATH];
	int			fd = -1;
	struct timeval tv;

	/*
	 * Honor the write-WAL-before-data rule, if appropriate, so that we do not
	 * write out data before associated WAL records.  This is the same action
	 * performed during FlushBuffer() in the main buffer manager.
	 */
	if (shared->group_lsn != NULL)
	{
		/*
		 * We must determine the largest async-commit LSN for the page. This
		 * is a bit tedious, but since this entire function is a slow path
		 * anyway, it seems better to do this here than to maintain a per-page
		 * LSN variable (which'd need an extra comparison in the
		 * transaction-commit path).
		 */
		XLogRecPtr	max_lsn;
		int			lsnindex,
					lsnoff;

		lsnindex = slotno * shared->lsn_groups_per_page;
		max_lsn = shared->group_lsn[lsnindex++];
		for (lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
		{
			XLogRecPtr	this_lsn = shared->group_lsn[lsnindex++];

			if (XLByteLT(max_lsn, this_lsn))
				max_lsn = this_lsn;
		}

		if (!XLogRecPtrIsInvalid(max_lsn))
		{
			/*
			 * As noted above, elog(ERROR) is not acceptable here, so if
			 * XLogFlush were to fail, we must PANIC.  This isn't much of a
			 * restriction because XLogFlush is just about all critical
			 * section anyway, but let's make sure.
			 */
			START_CRIT_SECTION();
			XLogFlush(max_lsn);
			END_CRIT_SECTION();
		}
	}

	/*
	 * During a Flush, we may already have the desired file open.
	 */
	if (fdata)
	{
		int			i;

		for (i = 0; i < fdata->num_files; i++)
		{
			if (fdata->segno[i] == segno)
			{
				fd = fdata->fd[i];
				break;
			}
		}
	}

	if (fd < 0)
	{
		/*
		 * If the file doesn't already exist, we should create it.  It is
		 * possible for this to need to happen when writing a page that's not
		 * first in its segment; we assume the OS can cope with that. (Note:
		 * it might seem that it'd be okay to create files only when
		 * SimpleLruZeroPage is called for the first page of a segment.
		 * However, if after a crash and restart the REDO logic elects to
		 * replay the log from a checkpoint before the latest one, then it's
		 * possible that we will get commands to set transaction status of
		 * transactions that have already been truncated from the commit log.
		 * Easiest way to deal with that is to accept references to
		 * nonexistent files here and in SlruPhysicalReadPage.)
		 *
		 * Note: it is possible for more than one backend to be executing this
		 * code simultaneously for different pages of the same file. Hence,
		 * don't use O_EXCL or O_TRUNC or anything like that.
		 */
		SlruFileName(ctl, path, segno);
		fd = BasicOpenFile(path, O_RDWR | O_CREAT | PG_BINARY,
						   S_IRUSR | S_IWUSR);
		if (fd < 0)
		{
			slru_errcause = SLRU_OPEN_FAILED;
			slru_errno = errno;
			return false;
		}

		if (fdata)
		{
			if (fdata->num_files < MAX_FLUSH_BUFFERS)
			{
				fdata->fd[fdata->num_files] = fd;
				fdata->segno[fdata->num_files] = segno;
				fdata->num_files++;
			}
			else
			{
				/*
				 * In the unlikely event that we exceed MAX_FLUSH_BUFFERS,
				 * fall back to treating it as a standalone write.
				 */
				fdata = NULL;
			}
		}
	}

	if (lseek(fd, (off_t) offset, SEEK_SET) < 0)
	{
		slru_errcause = SLRU_SEEK_FAILED;
		slru_errno = errno;
		if (!fdata)
			close(fd);
		return false;
	}

	errno = 0;
	if (write(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ)
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		slru_errcause = SLRU_WRITE_FAILED;
		slru_errno = errno;
		if (!fdata)
			close(fd);
		return false;
	}
#ifdef XP_TRACE_LRU_WRITE
	gettimeofday(&tv, NULL);
	ereport(TRACE_LEVEL,
		(errmsg("%ld.%ld:\tWRITE:\tSlruPhysicalWritePage:\tfile:%s",
				tv.tv_sec, tv.tv_usec, path)));
#endif

	/*
	 * If not part of Flush, need to fsync now.  We assume this happens
	 * infrequently enough that it's not a performance issue.
	 */
	if (!fdata)
	{
		if (ctl->do_fsync && pg_fsync(fd))
		{
			slru_errcause = SLRU_FSYNC_FAILED;
			slru_errno = errno;
			close(fd);
			return false;
		}

		if (close(fd))
		{
			slru_errcause = SLRU_CLOSE_FAILED;
			slru_errno = errno;
			return false;
		}
	}

	return true;
}
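The header comment notes that this function reports failure by returning false and stashing the cause in static variables for SlruReportIOError. A hedged caller-side sketch of that convention (the surrounding context is invented for illustration):

	/* Illustrative only: a standalone write followed by error reporting. */
	if (!SlruPhysicalWritePage(ctl, pageno, slotno, NULL))
		SlruReportIOError(ctl, pageno, InvalidTransactionId);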
Example #17
/*
 *	RecordTransactionCommitPrepared
 *
 * This is basically the same as RecordTransactionCommit: in particular,
 * we must set the inCommit flag to avoid a race condition.
 *
 * We know the transaction made at least one XLOG entry (its PREPARE),
 * so it is never possible to optimize out the commit record.
 */
static void
RecordTransactionCommitPrepared(TransactionId xid,
								int nchildren,
								TransactionId *children,
								int nrels,
								RelFileNode *rels)
{
	XLogRecData rdata[3];
	int			lastrdata = 0;
	xl_xact_commit_prepared xlrec;
	XLogRecPtr	recptr;

	START_CRIT_SECTION();

	/* See notes in RecordTransactionCommit */
	MyProc->inCommit = true;

	/* Emit the XLOG commit record */
	xlrec.xid = xid;
	xlrec.crec.xact_time = GetCurrentTimestamp();
	xlrec.crec.nrels = nrels;
	xlrec.crec.nsubxacts = nchildren;
	rdata[0].data = (char *) (&xlrec);
	rdata[0].len = MinSizeOfXactCommitPrepared;
	rdata[0].buffer = InvalidBuffer;
	/* dump rels to delete */
	if (nrels > 0)
	{
		rdata[0].next = &(rdata[1]);
		rdata[1].data = (char *) rels;
		rdata[1].len = nrels * sizeof(RelFileNode);
		rdata[1].buffer = InvalidBuffer;
		lastrdata = 1;
	}
	/* dump committed child Xids */
	if (nchildren > 0)
	{
		rdata[lastrdata].next = &(rdata[2]);
		rdata[2].data = (char *) children;
		rdata[2].len = nchildren * sizeof(TransactionId);
		rdata[2].buffer = InvalidBuffer;
		lastrdata = 2;
	}
	rdata[lastrdata].next = NULL;

	recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT_PREPARED, rdata);

	/*
	 * We don't currently try to sleep before flush here ... nor is there any
	 * support for async commit of a prepared xact (the very idea is probably
	 * a contradiction)
	 */

	/* Flush XLOG to disk */
	XLogFlush(recptr);

	/* Mark the transaction committed in pg_clog */
	TransactionIdCommitTree(xid, nchildren, children);

	/* Checkpoint can proceed now */
	MyProc->inCommit = false;

	END_CRIT_SECTION();
}
static void PersistentStore_DoInsertTuple(
	PersistentStoreData 		*storeData,

	PersistentStoreSharedData 	*storeSharedData,

	Relation				persistentRel,
				/* The persistent table relation. */

	Datum					*values,

	bool					flushToXLog,
				/* When true, the XLOG record for this change will be flushed to disk. */

	ItemPointer 			persistentTid)
				/* TID of the stored tuple. */

{
	bool 		*nulls;
	HeapTuple	persistentTuple = NULL;
	XLogRecPtr	xlogInsertEndLoc;

	/*
	 * In order to keep the tuples the exact same size to enable direct reuse of
	 * free tuples, we do not use NULLs.
	 */
	nulls = (bool*)palloc0(storeData->numAttributes * sizeof(bool));
		
	/*
	 * Form the tuple.
	 */
	persistentTuple = heap_form_tuple(persistentRel->rd_att, values, nulls);
	if (!HeapTupleIsValid(persistentTuple))
		elog(ERROR, "Failed to build persistent tuple ('%s')",
		     storeData->tableName);

	/*
	 * (We have an exclusive lock (higher up) here so we can direct the insert to the last page.)
	 */
	{
		// Do not assert valid ItemPointer -- it is ok if it is (0,0)...
		BlockNumber blockNumber = 
						BlockIdGetBlockNumber(
								&storeSharedData->maxTid.ip_blkid);
		
		frozen_heap_insert_directed(
							persistentRel, 
							persistentTuple,
							blockNumber);
	}

	if (Debug_persistent_store_print)
		elog(PersistentStore_DebugPrintLevel(), 
			 "PersistentStore_DoInsertTuple: old maximum known TID %s, new insert TID %s ('%s')",
			 ItemPointerToString(&storeSharedData->maxTid),
			 ItemPointerToString2(&persistentTuple->t_self),
			 storeData->tableName);
	if (ItemPointerCompare(
						&storeSharedData->maxTid,
						&persistentTuple->t_self) == -1)		
	{
		// Current max is Less-Than.
		storeSharedData->maxTid = persistentTuple->t_self;
	}
	
	/*
	 * Return the TID of the inserted tuple to the caller and remember the
	 * end location of the INSERT's XLOG record.
	 */
	*persistentTid = persistentTuple->t_self;
		
	xlogInsertEndLoc = XLogLastInsertEndLoc();

	heap_freetuple(persistentTuple);

	if (flushToXLog)
	{
		XLogFlush(xlogInsertEndLoc);
		XLogRecPtr_Zero(&nowaitXLogEndLoc);
	}
	else
		nowaitXLogEndLoc = xlogInsertEndLoc;

	pfree(nulls);

}
Example #19
/*
 * Finish preparing state file.
 *
 * Calculates CRC and writes state file to WAL and in pg_twophase directory.
 */
void
EndPrepare(GlobalTransaction gxact)
{
	TransactionId xid = gxact->proc.xid;
	TwoPhaseFileHeader *hdr;
	char		path[MAXPGPATH];
	XLogRecData *record;
	pg_crc32	statefile_crc;
	pg_crc32	bogus_crc;
	int			fd;

	/* Add the end sentinel to the list of 2PC records */
	RegisterTwoPhaseRecord(TWOPHASE_RM_END_ID, 0,
						   NULL, 0);

	/* Go back and fill in total_len in the file header record */
	hdr = (TwoPhaseFileHeader *) records.head->data;
	Assert(hdr->magic == TWOPHASE_MAGIC);
	hdr->total_len = records.total_len + sizeof(pg_crc32);

	/*
	 * If the file size exceeds MaxAllocSize, we won't be able to read it in
	 * ReadTwoPhaseFile. Check for that now, rather than fail at commit time.
	 */
	if (hdr->total_len > MaxAllocSize)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("two-phase state file maximum length exceeded")));

	/*
	 * Create the 2PC state file.
	 *
	 * Note: because we use BasicOpenFile(), we are responsible for ensuring
	 * the FD gets closed in any error exit path.  Once we get into the
	 * critical section, though, it doesn't matter since any failure causes
	 * PANIC anyway.
	 */
	TwoPhaseFilePath(path, xid);

	fd = BasicOpenFile(path,
					   O_CREAT | O_EXCL | O_WRONLY | PG_BINARY,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create two-phase state file \"%s\": %m",
						path)));

	/* Write data to file, and calculate CRC as we pass over it */
	INIT_CRC32(statefile_crc);

	for (record = records.head; record != NULL; record = record->next)
	{
		COMP_CRC32(statefile_crc, record->data, record->len);
		if ((write(fd, record->data, record->len)) != record->len)
		{
			close(fd);
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write two-phase state file: %m")));
		}
	}

	FIN_CRC32(statefile_crc);

	/*
	 * Write a deliberately bogus CRC to the state file; this is just paranoia
	 * to catch the case where four more bytes will run us out of disk space.
	 */
	bogus_crc = ~statefile_crc;

	if ((write(fd, &bogus_crc, sizeof(pg_crc32))) != sizeof(pg_crc32))
	{
		close(fd);
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write two-phase state file: %m")));
	}

	/* Back up to prepare for rewriting the CRC */
	if (lseek(fd, -((off_t) sizeof(pg_crc32)), SEEK_CUR) < 0)
	{
		close(fd);
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek in two-phase state file: %m")));
	}

	/*
	 * The state file isn't valid yet, because we haven't written the correct
	 * CRC yet.  Before we do that, insert entry in WAL and flush it to disk.
	 *
	 * Between the time we have written the WAL entry and the time we write
	 * out the correct state file CRC, we have an inconsistency: the xact is
	 * prepared according to WAL but not according to our on-disk state. We
	 * use a critical section to force a PANIC if we are unable to complete
	 * the write --- then, WAL replay should repair the inconsistency.	The
	 * odds of a PANIC actually occurring should be very tiny given that we
	 * were able to write the bogus CRC above.
	 *
	 * We have to set inCommit here, too; otherwise a checkpoint starting
	 * immediately after the WAL record is inserted could complete without
	 * fsync'ing our state file.  (This is essentially the same kind of race
	 * condition as the COMMIT-to-clog-write case that RecordTransactionCommit
	 * uses inCommit for; see notes there.)
	 *
	 * We save the PREPARE record's location in the gxact for later use by
	 * CheckPointTwoPhase.
	 */
	START_CRIT_SECTION();

	MyProc->inCommit = true;

	gxact->prepare_lsn = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE,
									records.head);
	XLogFlush(gxact->prepare_lsn);

	/* If we crash now, we have prepared: WAL replay will fix things */

	/* write correct CRC and close file */
	if ((write(fd, &statefile_crc, sizeof(pg_crc32))) != sizeof(pg_crc32))
	{
		close(fd);
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write two-phase state file: %m")));
	}

	if (close(fd) != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close two-phase state file: %m")));

	/*
	 * Mark the prepared transaction as valid.	As soon as xact.c marks MyProc
	 * as not running our XID (which it will do immediately after this
	 * function returns), others can commit/rollback the xact.
	 *
	 * NB: a side effect of this is to make a dummy ProcArray entry for the
	 * prepared XID.  This must happen before we clear the XID from MyProc,
	 * else there is a window where the XID is not running according to
	 * TransactionIdIsInProgress, and onlookers would be entitled to assume
	 * the xact crashed.  Instead we have a window where the same XID appears
	 * twice in ProcArray, which is OK.
	 */
	MarkAsPrepared(gxact);

	/*
	 * Now we can mark ourselves as out of the commit critical section: a
	 * checkpoint starting after this will certainly see the gxact as a
	 * candidate for fsyncing.
	 */
	MyProc->inCommit = false;

	END_CRIT_SECTION();

	records.tail = records.head = NULL;
}
void PersistentStore_UpdateTuple(
	PersistentStoreData 		*storeData,

	PersistentStoreSharedData 	*storeSharedData,

	ItemPointer 			persistentTid,
				/* TID of the stored tuple. */

	Datum					*values,

	bool					flushToXLog)
				/* When true, the XLOG record for this change will be flushed to disk. */

{
	Relation	persistentRel;
	bool 		*nulls;
	HeapTuple	persistentTuple = NULL;
	XLogRecPtr 	xlogUpdateEndLoc;
	
#ifdef USE_ASSERT_CHECKING
	if (storeSharedData == NULL ||
		!PersistentStoreSharedData_EyecatcherIsValid(storeSharedData))
		elog(ERROR, "Persistent store shared-memory not valid");
#endif
	
	if (Debug_persistent_store_print)
		elog(PersistentStore_DebugPrintLevel(), 
			 "PersistentStore_ReplaceTuple: Going to update whole tuple at TID %s ('%s', shared data %p)",
			 ItemPointerToString(persistentTid),
			 storeData->tableName,
			 storeSharedData);

	persistentRel = (*storeData->openRel)();

	/*
	 * In order to keep the tuples the exact same size to enable direct reuse of
	 * free tuples, we do not use NULLs.
	 */
	nulls = (bool*)palloc0(storeData->numAttributes * sizeof(bool));
		
	/*
	 * Form the tuple.
	 */
	persistentTuple = heap_form_tuple(persistentRel->rd_att, values, nulls);
	if (!HeapTupleIsValid(persistentTuple))
		elog(ERROR, "Failed to build persistent tuple ('%s')",
		     storeData->tableName);

	persistentTuple->t_self = *persistentTid;

	frozen_heap_inplace_update(persistentRel, persistentTuple);

	/*
	 * Remember the end location of the UPDATE's XLOG record.
	 */
	xlogUpdateEndLoc = XLogLastInsertEndLoc();

	heap_freetuple(persistentTuple);

#ifdef FAULT_INJECTOR
	if (FaultInjector_InjectFaultIfSet(SyncPersistentTable,
										DDLNotSpecified,
										"" /* databaseName */,
										"" /* tableName */)== FaultInjectorTypeSkip)
	{
		FlushRelationBuffers(persistentRel);
		smgrimmedsync(persistentRel->rd_smgr);
	}
#endif

	(*storeData->closeRel)(persistentRel);
	
	if (Debug_persistent_store_print)
	{
		elog(PersistentStore_DebugPrintLevel(), 
			 "PersistentStore_UpdateTuple: Updated whole tuple at TID %s ('%s')",
			 ItemPointerToString(persistentTid),
			 storeData->tableName);

		(*storeData->printTupleCallback)(
									PersistentStore_DebugPrintLevel(),
									"STORE UPDATED TUPLE",
									persistentTid,
									values);
	}

	if (flushToXLog)
	{
		XLogFlush(xlogUpdateEndLoc);
		XLogRecPtr_Zero(&nowaitXLogEndLoc);
	}
	else
		nowaitXLogEndLoc = xlogUpdateEndLoc;
}
Example #21
/*
 * Execute the barrier command on all the components, including Datanodes and
 * Coordinators.
 */
static void
ExecuteBarrier(const char *id)
{
	List *barrierDataNodeList = GetAllDataNodes();
	List *barrierCoordList = GetAllCoordNodes();
	PGXCNodeAllHandles *conn_handles;
	int conn;
	int msglen;
	int barrier_idlen;

	conn_handles = get_handles(barrierDataNodeList, barrierCoordList, false, true);

	elog(DEBUG2, "Sending CREATE BARRIER <%s> EXECUTE message to "
				 "Datanodes and Coordinator", id);
	/*
	 * Send a CREATE BARRIER request to all the Datanodes and the Coordinators
	 */
	for (conn = 0; conn < conn_handles->co_conn_count + conn_handles->dn_conn_count; conn++)
	{
		PGXCNodeHandle *handle;

		if (conn < conn_handles->co_conn_count)
			handle = conn_handles->coord_handles[conn];
		else
			handle = conn_handles->datanode_handles[conn - conn_handles->co_conn_count];

		/* Invalid connection state, return error */
		if (handle->state != DN_CONNECTION_STATE_IDLE)
			ereport(ERROR,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("Failed to send CREATE BARRIER EXECUTE request "
						 	"to the node")));

		barrier_idlen = strlen(id) + 1;

		msglen = 4; /* for the length itself */
		msglen += barrier_idlen;
		msglen += 1; /* for barrier command itself */

		/* msgType + msgLen */
		if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0)
		{
			ereport(ERROR,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("Out of memory")));
		}

		handle->outBuffer[handle->outEnd++] = 'b';
		msglen = htonl(msglen);
		memcpy(handle->outBuffer + handle->outEnd, &msglen, 4);
		handle->outEnd += 4;

		handle->outBuffer[handle->outEnd++] = CREATE_BARRIER_EXECUTE;

		memcpy(handle->outBuffer + handle->outEnd, id, barrier_idlen);
		handle->outEnd += barrier_idlen;

		handle->state = DN_CONNECTION_STATE_QUERY;
		pgxc_node_flush(handle);
	}

	CheckBarrierCommandStatus(conn_handles, id, "EXECUTE");

	pfree_pgxc_all_handles(conn_handles);

	/*
	 * Also WAL log the BARRIER locally and flush the WAL buffers to disk
	 */
	{
		XLogRecData rdata[1];
		XLogRecPtr recptr;

		rdata[0].data = (char *) id;
		rdata[0].len = strlen(id) + 1;
		rdata[0].buffer = InvalidBuffer;
		rdata[0].next = NULL;

		recptr = XLogInsert(RM_BARRIER_ID, XLOG_BARRIER_CREATE, rdata);
		XLogFlush(recptr);
	}
}
Example #22
/**
 * @brief Write out the block buffer contents.  The number of blocks to
 * write is derived from loader->curblk and the current page.
 *
 * Flow:
 * <ol>
 *	 <li>If no more space is available in the data file, switch to a new one.</li>
 *	 <li>Compute the number of blocks which can be written to the current file.</li>
 *	 <li>Save the last block number in the load status file.</li>
 *	 <li>Write to the current file.</li>
 *	 <li>If more data remains, repeat for the next file.</li>
 * </ol>
 *
 * @param loader [in] Direct Writer.
 */
static void
flush_pages(DirectWriter *loader)
{
	int			i;
	int			num;
	LoadStatus *ls = &loader->ls;

	num = loader->curblk;
	if (!PageIsEmpty(GetCurrentPage(loader)))
		num += 1;

	if (num <= 0)
		return;		/* no work */

	/*
	 * Add a WAL entry (for the first page only) to ensure the current xid
	 * is recorded in xlog.  To follow the WAL protocol, we must flush the
	 * xlog records with XLogFlush() before writing any data blocks.
	 *
	 * If a postgres process, such as the loader or COPY, is killed with
	 * "kill -9", the database is rewound to the last checkpoint and
	 * recovery is performed using WAL.
	 *
	 * After recovery, any xids that were never recorded in WAL may be
	 * reused.
	 *
	 * In the loader and COPY, however, the data file has actually been
	 * updated, so such xids must not be reused.
	 *
	 * A WAL entry carrying the xid can be added with XLogInsert(), but
	 * such entries are not written to disk immediately.  WAL entries are
	 * flushed to disk by XLogFlush(), typically when a transaction is
	 * committed.  COPY prevents xid reuse by this method.
	 */
#if PG_VERSION_NUM >= 90100
	if (ls->ls.create_cnt == 0 && !RELATION_IS_LOCAL(loader->base.rel)
			&& !(loader->base.rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED) )
	{
		XLogRecPtr	recptr;

		recptr = log_newpage(&ls->ls.rnode, MAIN_FORKNUM,
			ls->ls.exist_cnt, loader->blocks);
		XLogFlush(recptr);
	}
#else
	if (ls->ls.create_cnt == 0 && !RELATION_IS_LOCAL(loader->base.rel) )
	{
		XLogRecPtr	recptr;

		recptr = log_newpage(&ls->ls.rnode, MAIN_FORKNUM,
			ls->ls.exist_cnt, loader->blocks);
		XLogFlush(recptr);
	}
#endif
	/*
	 * Write blocks. We might need to write multiple files on boundary of
	 * relation segments.
	 */
	for (i = 0; i < num;)
	{
		char	   *buffer;
		int			total;
		int			written;
		int			flush_num;
		BlockNumber	relblks = LS_TOTAL_CNT(ls);

		/* Switch to the next file if the current file has been filled up. */
		if (relblks % RELSEG_SIZE == 0)
			close_data_file(loader);
		if (loader->datafd == -1)
			loader->datafd = open_data_file(ls->ls.rnode,
											RELATION_IS_LOCAL(loader->base.rel),
											relblks);

		/* Number of blocks to be added to the current file. */
		flush_num = Min(num - i, RELSEG_SIZE - relblks % RELSEG_SIZE);
		Assert(flush_num > 0);

		/* Write the last block number to the load status file. */
		UpdateLSF(loader, flush_num);

#if PG_VERSION_NUM >= 90300
		/* If we need a checksum, add it */
		if (DataChecksumsEnabled())
		{
			int		j;
			Page	contained_page;

			for (j = 0; j < flush_num; j++)
			{
				contained_page = GetTargetPage(loader, j);
				((PageHeader) contained_page)->pd_checksum =
					pg_checksum_page((char *) contained_page, LS_TOTAL_CNT(ls) - 1 - j);
			}
		}
#endif

		/*
		 * Write flush_num data blocks to the current file.  After this,
		 * the current file holds at most RELSEG_SIZE blocks.
		 */
		buffer = loader->blocks + BLCKSZ * i;
		total = BLCKSZ * flush_num;
		written = 0;
		while (total > 0)
		{
			int	len = write(loader->datafd, buffer + written, total);
			if (len == -1)
			{
				/* fatal error, do not want to write blocks anymore */
				ereport(ERROR, (errcode_for_file_access(),
								errmsg("could not write to data file: %m")));
			}
			written += len;
			total -= len;
		}

		i += flush_num;
	}

	/*
	 * NOTICE: Be sure to reset curblk to 0 and reinitialize the recycled
	 * page if you continue to use the block buffers.
	 */
}
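A minimal caller-side sketch of the NOTICE above (hypothetical fragment; the identifiers follow the loader structure used in this example): after flushing, reset the block cursor and re-initialize the first page before reusing the buffer.

	/* Illustrative only: prepare the block buffer for reuse after a flush. */
	flush_pages(loader);
	loader->curblk = 0;
	PageInit(GetCurrentPage(loader), BLCKSZ, 0);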