Code Example #1
File: spgxlog.c Project: avontd2868/postgres
static void
spgRedoVacuumRoot(XLogRecPtr lsn, XLogRecord *record)
{
	char	   *ptr = XLogRecGetData(record);
	spgxlogVacuumRoot *xldata = (spgxlogVacuumRoot *) ptr;
	OffsetNumber *toDelete;
	Buffer		buffer;
	Page		page;

	ptr += sizeof(spgxlogVacuumRoot);
	toDelete = (OffsetNumber *) ptr;

	if (!(record->xl_info & XLR_BKP_BLOCK_1))
	{
		buffer = XLogReadBuffer(xldata->node, SPGIST_HEAD_BLKNO, false);
		if (BufferIsValid(buffer))
		{
			page = BufferGetPage(buffer);
			if (!XLByteLE(lsn, PageGetLSN(page)))
			{
				/* The tuple numbers are in order */
				PageIndexMultiDelete(page, toDelete, xldata->nDelete);

				PageSetLSN(page, lsn);
				PageSetTLI(page, ThisTimeLineID);
				MarkBufferDirty(buffer);
			}
			UnlockReleaseBuffer(buffer);
		}
	}
}
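
These examples all target the pre-9.3 WAL pointer representation, in which an LSN is a pair of 32-bit fields and XLByteLE compares two pointers lexicographically. A sketch of the relevant declarations, paraphrased from the xlogdefs.h of that era (not authoritative):

/* Pre-9.3 WAL pointer: a (log file #, byte offset) pair. */
typedef struct XLogRecPtr
{
	uint32		xlogid;			/* log file #, 0 based */
	uint32		xrecoff;		/* byte offset of location in log file */
} XLogRecPtr;

/* true iff a <= b, comparing (xlogid, xrecoff) lexicographically */
#define XLByteLE(a, b) \
	((a).xlogid < (b).xlogid || \
	 ((a).xlogid == (b).xlogid && (a).xrecoff <= (b).xrecoff))

/* true iff a == b */
#define XLByteEQ(a, b) \
	((a).xlogid == (b).xlogid && (a).xrecoff == (b).xrecoff)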
Code Example #2
/*
 * Returns the replication apply delay in ms
 */
int
GetReplicationApplyDelay(void)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile WalRcvData *walrcv = WalRcv;

	XLogRecPtr	receivePtr;
	XLogRecPtr	replayPtr;

	long		secs;
	int			usecs;

	SpinLockAcquire(&walrcv->mutex);
	receivePtr = walrcv->receivedUpto;
	SpinLockRelease(&walrcv->mutex);

	replayPtr = GetXLogReplayRecPtr(NULL);

	if (XLByteLE(receivePtr, replayPtr))
		return 0;

	TimestampDifference(GetCurrentChunkReplayStartTime(),
						GetCurrentTimestamp(),
						&secs, &usecs);

	return (((int) secs * 1000) + (usecs / 1000));
}
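
A hypothetical monitoring use of the function above; GetReplicationApplyDelay is taken from the source, but the threshold parameter and the check itself are illustrative only:

/* Hypothetical check; max_apply_delay_ms is an assumed configuration value. */
static void
CheckApplyDelay(int max_apply_delay_ms)
{
	int			delay_ms = GetReplicationApplyDelay();

	if (delay_ms > max_apply_delay_ms)
		elog(WARNING, "replication apply delay is %d ms", delay_ms);
}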
Code Example #3
File: walsender.c Project: ibejoeb/postgres
/*
 * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk,
 * but not yet sent to the client, and buffer it in the libpq output
 * buffer.
 *
 * msgbuf is a work area in which the output message is constructed.  It's
 * passed in just so we can avoid re-palloc'ing the buffer on each cycle.
 * It must be of size 1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE.
 *
 * If there is no unsent WAL remaining, *caughtup is set to true, otherwise
 * *caughtup is set to false.
 */
static void
XLogSend(char *msgbuf, bool *caughtup)
{
	XLogRecPtr	SendRqstPtr;
	XLogRecPtr	startptr;
	XLogRecPtr	endptr;
	Size		nbytes;
	WalDataMessageHeader msghdr;

	/*
	 * Attempt to send all data that's already been written out and fsync'd to
	 * disk.  We cannot go further than what's been written out given the
	 * current implementation of XLogRead().  And in any case it's unsafe to
	 * send WAL that is not securely down to disk on the master: if the master
	 * subsequently crashes and restarts, slaves must not have applied any WAL
	 * that gets lost on the master.
	 */
	SendRqstPtr = am_cascading_walsender ? GetStandbyFlushRecPtr() : GetFlushRecPtr();

	/* Quick exit if nothing to do */
	if (XLByteLE(SendRqstPtr, sentPtr))
	{
		*caughtup = true;
		return;
	}

	/*
	 * Figure out how much to send in one message. If there's no more than
	 * MAX_SEND_SIZE bytes to send, send everything. Otherwise send
	 * MAX_SEND_SIZE bytes, but round back to logfile or page boundary.
	 *
	 * The rounding is not only for performance reasons. Walreceiver relies on
	 * the fact that we never split a WAL record across two messages. Since a
	 * long WAL record is split at page boundary into continuation records,
	 * page boundary is always a safe cut-off point. We also assume that
	 * SendRqstPtr never points to the middle of a WAL record.
	 */
	startptr = sentPtr;
	if (startptr.xrecoff >= XLogFileSize)
	{
		/*
		 * crossing a logid boundary, skip the non-existent last log segment
		 * in previous logical log file.
		 */
		startptr.xlogid += 1;
		startptr.xrecoff = 0;
	}

	endptr = startptr;
	XLByteAdvance(endptr, MAX_SEND_SIZE);
	if (endptr.xlogid != startptr.xlogid)
	{
		/* Don't cross a logfile boundary within one message */
		Assert(endptr.xlogid == startptr.xlogid + 1);
		endptr.xlogid = startptr.xlogid;
		endptr.xrecoff = XLogFileSize;
	}

	/* if we went beyond SendRqstPtr, back off */
	if (XLByteLE(SendRqstPtr, endptr))
	{
		endptr = SendRqstPtr;
		*caughtup = true;
	}
	else
	{
		/* round down to page boundary. */
		endptr.xrecoff -= (endptr.xrecoff % XLOG_BLCKSZ);
		*caughtup = false;
	}

	nbytes = endptr.xrecoff - startptr.xrecoff;
	Assert(nbytes <= MAX_SEND_SIZE);

	/*
	 * OK to read and send the slice.
	 */
	msgbuf[0] = 'w';

	/*
	 * Read the log directly into the output buffer to avoid extra memcpy
	 * calls.
	 */
	XLogRead(msgbuf + 1 + sizeof(WalDataMessageHeader), startptr, nbytes);

	/*
	 * We fill the message header last so that the send timestamp is taken as
	 * late as possible.
	 */
	msghdr.dataStart = startptr;
	msghdr.walEnd = SendRqstPtr;
	msghdr.sendTime = GetCurrentTimestamp();

	memcpy(msgbuf + 1, &msghdr, sizeof(WalDataMessageHeader));

	pq_putmessage_noblock('d', msgbuf, 1 + sizeof(WalDataMessageHeader) + nbytes);

	sentPtr = endptr;

	/* Update shared memory status */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile WalSnd *walsnd = MyWalSnd;

		SpinLockAcquire(&walsnd->mutex);
		walsnd->sentPtr = sentPtr;
		SpinLockRelease(&walsnd->mutex);
	}

	/* Report progress of XLOG streaming in PS display */
	if (update_process_title)
	{
		char		activitymsg[50];

		snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X",
				 sentPtr.xlogid, sentPtr.xrecoff);
		set_ps_display(activitymsg, false);
	}

	return;
}
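
The header comment above fixes msgbuf's size contract. A minimal caller sketch honoring it; only XLogSend, MAX_SEND_SIZE, WalDataMessageHeader, palloc, and pfree are taken from the source, while the loop itself is illustrative (the real walsender main loop also handles network I/O and signals between sends):

/* Sketch of a send loop honoring XLogSend's buffer-size contract. */
static void
SendPendingWal(void)
{
	char	   *msgbuf;
	bool		caughtup = false;

	/* must be exactly the size the header comment demands */
	msgbuf = palloc(1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE);

	/* keep sending until we have caught up with the flushed WAL */
	while (!caughtup)
		XLogSend(msgbuf, &caughtup);

	pfree(msgbuf);
}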
Code Example #4
File: cdbfilerepresyncworker.c Project: AnLingm/gpdb
static int
FileRepPrimary_ResyncBufferPoolIncrementalWrite(ChangeTrackingRequest *request)
{
	int				status = STATUS_OK;
	Page			page;
	Buffer			buf; 
	BlockNumber		numBlocks = 0;
	SMgrRelation	smgr_relation = NULL;
	char			relidstr[OIDCHARS + 1 + OIDCHARS + 1 + OIDCHARS + 1];
	int				ii;
	XLogRecPtr		loc;
	XLogRecPtr		loc1;
	int				count = 0;
	int				thresholdCount = 0;
	bool			mirrorDataLossOccurred = FALSE;
	int				NumberOfRelations = request->count;
	
	FileRepResyncHashEntry_s	entry;
	ChangeTrackingResult		*result = NULL;	

	while (1)
	{
		/* allow flushing buffers from buffer pool during scan */
		FileRepResync_SetReadBufferRequest();
		if ((result = ChangeTracking_GetChanges(request)) != NULL) 
		{
			FileRepResync_ResetReadBufferRequest();
					
			for (ii = 0; ii < result->count; ii++)
			{
				
				if (smgr_relation == NULL)
				{
					NumberOfRelations--;
					
					smgr_relation = smgropen(result->entries[ii].relFileNode);
					
					snprintf(relidstr, sizeof(relidstr), "%u/%u/%u",
							 smgr_relation->smgr_rnode.spcNode,
							 smgr_relation->smgr_rnode.dbNode,
							 smgr_relation->smgr_rnode.relNode);

					numBlocks = smgrnblocks(smgr_relation);
					
					if (Debug_filerep_print)
						elog(LOG, "resynchronize buffer pool relation '%u/%u/%u' "
							 "number of blocks:'%u' ",
							 smgr_relation->smgr_rnode.spcNode,
							 smgr_relation->smgr_rnode.dbNode,
							 smgr_relation->smgr_rnode.relNode,
							 numBlocks);
					
					thresholdCount = Min(numBlocks, 1024);
				}
				
				loc1 =  result->entries[ii].lsn_end;
				
				/*
				 * if relation was truncated then block_num from change tracking can be beyond numBlocks 
				 */
				if (result->entries[ii].block_num >=  numBlocks)
				{
					ereport(LOG,	
							(errmsg("could not resynchonize buffer pool relation '%s' block '%d' (maybe due to truncate), "
									"lsn change tracking '%s(%u/%u)' "
									"number of blocks '%d' ",
									relidstr,
									result->entries[ii].block_num,
									XLogLocationToString(&loc1),
									loc1.xlogid,
									loc1.xrecoff,
									numBlocks),						
							 FileRep_errcontext()));						
					
					goto flush_check;
				}
				
				/* allow flushing buffers from buffer pool during scan */
				FileRepResync_SetReadBufferRequest();
				buf = ReadBuffer_Resync(smgr_relation,
										result->entries[ii].block_num,
										relidstr);
				FileRepResync_ResetReadBufferRequest();
				
				Assert(result->entries[ii].block_num < numBlocks);
				
				LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
				page = BufferGetPage(buf);
				
				loc = PageGetLSN(page); 
				
				if(Debug_filerep_print)
				{
					elog(LOG,	
							"incremental resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn page '%s(%u/%u)' "
							"lsn end change tracking '%s(%u/%u)' ",
							relidstr,
							numBlocks,
							result->entries[ii].block_num,
							XLogLocationToString(&loc),
							loc.xlogid,
							loc.xrecoff,
							XLogLocationToString(&loc1),
							result->entries[ii].lsn_end.xlogid,
							result->entries[ii].lsn_end.xrecoff);					
				}
				else
				{
					char	tmpBuf[FILEREP_MAX_LOG_DESCRIPTION_LEN];
					
					snprintf(tmpBuf, sizeof(tmpBuf), 
							 "incremental resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn page '%s(%u/%u)' ",
							 relidstr,
							 numBlocks,
							 result->entries[ii].block_num,
							 XLogLocationToString(&loc),
							 loc.xlogid,
							 loc.xrecoff);
					
					FileRep_InsertConfigLogEntry(tmpBuf);
					
					snprintf(tmpBuf, sizeof(tmpBuf), 
							 "incremental resync buffer pool identifier '%s' lsn end change tracking '%s(%u/%u)' ",
							 relidstr,
							 XLogLocationToString(&loc1),
							 result->entries[ii].lsn_end.xlogid,
							 result->entries[ii].lsn_end.xrecoff);
					
					FileRep_InsertConfigLogEntry(tmpBuf);
					
				}
								
				if (XLByteLE(result->entries[ii].lsn_end, PageGetLSN(page)))
				{
					if (! XLByteEQ(PageGetLSN(page), result->entries[ii].lsn_end))
					{
						ereport(LOG,
							(errmsg("Resynchonize buffer pool relation '%s' block '%d' has page lsn less than CT lsn, "
								"lsn end change tracking '%s(%u/%u)' lsn page '%s(%u/%u)' "
								"number of blocks '%d'",
								relidstr,
								result->entries[ii].block_num,
								XLogLocationToString(&loc),
								loc.xlogid,
								loc.xrecoff,
								XLogLocationToString(&loc1),
								loc1.xlogid,
								loc1.xrecoff,
								numBlocks),
							 FileRep_errcontext()));

					}

					/*
					 * It's safe, and better, to write the page to the mirror in
					 * this case, as primary and mirror data pages should always
					 * be the same.  We might do some extra work, but we
					 * definitely won't lose blocks or error out and need to
					 * perform a full recovery.  We need to cover this case
					 * because there are known scenarios where the CT file can
					 * contain extra records that should have been discarded
					 * but, since the xlog LSN information was lost, cannot be.
					 * One such case is when compaction of CT_TRANSIENT into
					 * CT_COMPACT with a specific xlog LSN (to discard extra
					 * records) in CT mode gets interrupted by resync.
					 * Compaction during resync collects all the CT records and
					 * has no xlog LSN information with which to discard any
					 * extra records from CT_TRANSIENT.
					 */

					smgrwrite(smgr_relation,
							  result->entries[ii].block_num,
							  (char *)BufferGetBlock(buf),
							  FALSE);
				}

#ifdef FAULT_INJECTOR	
				FaultInjector_InjectFaultIfSet(
											   FileRepResyncWorker, 
											   DDLNotSpecified,
											   "",	// databaseName
											   ""); // tableName
#endif				
				
				UnlockReleaseBuffer(buf);
				
#ifdef FAULT_INJECTOR	
				FaultInjector_InjectFaultIfSet(
											   FileRepResyncWorker, 
											   DDLNotSpecified,
											   "",	// databaseName
											   ""); // tableName
#endif				
		
	flush_check:			
				if (((ii + 1) == result->count) ||
					! (result->entries[ii].relFileNode.spcNode == result->entries[ii+1].relFileNode.spcNode &&
					   result->entries[ii].relFileNode.dbNode == result->entries[ii+1].relFileNode.dbNode &&
					   result->entries[ii].relFileNode.relNode == result->entries[ii+1].relFileNode.relNode))
				{
					if (result->ask_for_more == false)
					{
								
						smgrimmedsync(smgr_relation);
						
						smgrclose(smgr_relation);
								 
						smgr_relation = NULL;
							
						FileRep_GetRelationPath(
												 entry.fileName, 
												 result->entries[ii].relFileNode, 
												 0 /* segment file number is always 0 for Buffer Pool */);							 
								 
						status = FileRepResync_UpdateEntry(&entry);
						if (status != STATUS_OK)
						{
							 break;
						}
					}
								 
				}			
							
				if (count > thresholdCount)
				{
					count = 0;
					FileRepSubProcess_ProcessSignals();
					
					if (! (FileRepSubProcess_GetState() == FileRepStateReady && 
						   dataState == DataStateInResync))
					{
						mirrorDataLossOccurred = TRUE;
						break;
					}
				}
				else
					count++;
			}  // for (ii = 0; ii < result->count; ii++)
			
		} // if ((result = ChangeTracking_GetChanges(request)) != NULL) 
		
		FileRepResync_ResetReadBufferRequest();
			
		if (result != NULL && result->ask_for_more == true)
		{
			Assert(request->count == 1);
			request->entries[0].lsn_start = result->next_start_lsn;
		}
		else
		{
			break;
		}

	} // while(1) 
		
	ChangeTracking_FreeRequest(request);
	ChangeTracking_FreeResult(result);
	
	Insist(NumberOfRelations == 0);
	
	if (mirrorDataLossOccurred)
		status = STATUS_ERROR;
	
	return status;	
}
Code Example #5
File: cdbfilerepresyncworker.c Project: AnLingm/gpdb
static int
FileRepPrimary_ResyncWrite(FileRepResyncHashEntry_s	*entry)
{

	int				status = STATUS_OK;
	Page			page;
	Buffer			buf; 
	BlockNumber		numBlocks;
	BlockNumber		blkno;
	SMgrRelation	smgr_relation;
	char			relidstr[OIDCHARS + 1 + OIDCHARS + 1 + OIDCHARS + 1];
	XLogRecPtr		loc;
	int				count = 0;
	int				thresholdCount = 0;
	bool			mirrorDataLossOccurred = FALSE;
		
	switch (entry->relStorageMgr)
	{

		case PersistentFileSysRelStorageMgr_BufferPool:
			
			switch (entry->mirrorDataSynchronizationState)
			{
				case MirroredRelDataSynchronizationState_BufferPoolScanIncremental:
				case MirroredRelDataSynchronizationState_FullCopy:

					smgr_relation = smgropen(entry->relFileNode);
					
					numBlocks = smgrnblocks(smgr_relation);

					snprintf(relidstr, sizeof(relidstr), "%u/%u/%u",
							 smgr_relation->smgr_rnode.spcNode,
							 smgr_relation->smgr_rnode.dbNode,
							 smgr_relation->smgr_rnode.relNode);

					if (Debug_filerep_print)
						elog(LOG, "resync buffer pool relation '%s' number of blocks '%d' ",
							 relidstr, numBlocks);

					thresholdCount = Min(numBlocks, 1024);
					
					/* 
					 * required in order to report how many blocks were synchronized 
					 * if gp_persistent_relation_node does not return that information 
					 */
					if (entry->mirrorBufpoolResyncChangedPageCount == 0)
					{
						entry->mirrorBufpoolResyncChangedPageCount = numBlocks - entry->mirrorBufpoolResyncCkptBlockNum;
					}
					
					for (blkno = entry->mirrorBufpoolResyncCkptBlockNum; blkno < numBlocks; blkno++) 
					{
						XLogRecPtr	endResyncLSN = (isFullResync() ? 
													FileRepResync_GetEndFullResyncLSN() :
													FileRepResync_GetEndIncrResyncLSN());
#ifdef FAULT_INJECTOR
						FaultInjector_InjectFaultIfSet(
													   FileRepResyncWorkerRead,
													   DDLNotSpecified,
													   "",	//databaseName
													   ""); // tableName
#endif				
						
						FileRepResync_SetReadBufferRequest();
						buf = ReadBuffer_Resync(smgr_relation, blkno, relidstr);
						FileRepResync_ResetReadBufferRequest();
						
						LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
						page = BufferGetPage(buf);
						
						loc = PageGetLSN(page);
						
						if (Debug_filerep_print)
						{
							elog(LOG, 
									 "full resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn begin change tracking '%s(%u/%u)' "
									 "lsn page '%s(%u/%u)' lsn end change tracking '%s(%u/%u)' ",
									 relidstr,
									 numBlocks,
									 blkno,
									 XLogLocationToString(&entry->mirrorBufpoolResyncCkptLoc),
									 entry->mirrorBufpoolResyncCkptLoc.xlogid,
									 entry->mirrorBufpoolResyncCkptLoc.xrecoff,
									 XLogLocationToString(&loc),
									 loc.xlogid,
									 loc.xrecoff,
									 XLogLocationToString(&endResyncLSN),
									 endResyncLSN.xlogid,
									 endResyncLSN.xrecoff);
						}
						else
						{
							char	tmpBuf[FILEREP_MAX_LOG_DESCRIPTION_LEN];
							
							snprintf(tmpBuf, sizeof(tmpBuf), 
									 "full resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn begin change tracking '%s(%u/%u)' ",
									 relidstr,
									 numBlocks,
									 blkno,
									 XLogLocationToString(&entry->mirrorBufpoolResyncCkptLoc),
									 entry->mirrorBufpoolResyncCkptLoc.xlogid,
									 entry->mirrorBufpoolResyncCkptLoc.xrecoff);
														
							FileRep_InsertConfigLogEntry(tmpBuf);
							
							snprintf(tmpBuf, sizeof(tmpBuf), 
									 "full resync buffer pool identifier '%s' lsn page '%s(%u/%u)' lsn end change tracking '%s(%u/%u)' ",
									 relidstr,
									 XLogLocationToString(&loc),
									 loc.xlogid,
									 loc.xrecoff,
									 XLogLocationToString(&endResyncLSN),
									 endResyncLSN.xlogid,
									 endResyncLSN.xrecoff);
							
							FileRep_InsertConfigLogEntry(tmpBuf);
							
						}
						
						if (XLByteLE(PageGetLSN(page), endResyncLSN) &&
							XLByteLE(entry->mirrorBufpoolResyncCkptLoc, PageGetLSN(page))) 
						{
							smgrwrite(smgr_relation, 
									  blkno,
									  (char *)BufferGetBlock(buf),
									  FALSE);
						}
						
#ifdef FAULT_INJECTOR	
						FaultInjector_InjectFaultIfSet(
													   FileRepResyncWorker, 
													   DDLNotSpecified,
													   "",	// databaseName
													   ""); // tableName
#endif				
						
						UnlockReleaseBuffer(buf);
						
						if (count > thresholdCount)
						{
							count = 0;
							FileRepSubProcess_ProcessSignals();
							
							if (! (FileRepSubProcess_GetState() == FileRepStateReady && 
								   dataState == DataStateInResync))
							{
								mirrorDataLossOccurred = TRUE;
								break;
							}
						}
						else
							count++;
					}
						
					if (mirrorDataLossOccurred)
						break;

					if (entry->mirrorDataSynchronizationState != MirroredRelDataSynchronizationState_FullCopy)
					{
						LockRelationForResyncExtension(&smgr_relation->smgr_rnode, ExclusiveLock);
					
						numBlocks = smgrnblocks(smgr_relation);
					
						smgrtruncate(smgr_relation,
								 numBlocks,
								 TRUE /* isTemp, TRUE means to not record in XLOG */,
								 FALSE /* isLocalBuf */,
								 &entry->persistentTid,
								 entry->persistentSerialNum);
								 
						UnlockRelationForResyncExtension(&smgr_relation->smgr_rnode, ExclusiveLock);
					}
					
					smgrimmedsync(smgr_relation);
					smgrclose(smgr_relation);
					
					smgr_relation = NULL;
					break;
					
				case MirroredRelDataSynchronizationState_None:										
				case MirroredRelDataSynchronizationState_DataSynchronized:
					break;
					
				default:
					ereport(LOG, 
							(errmsg("could not resynchronize relation '%u/%u/%u' "
									"mirror synchronization state:'%s(%d)' ",
									entry->relFileNode.spcNode,
									entry->relFileNode.dbNode,
									entry->relFileNode.relNode,
									MirroredRelDataSynchronizationState_Name(entry->mirrorDataSynchronizationState),
									entry->mirrorDataSynchronizationState)));
					break;
			}
			break;
			
		case PersistentFileSysRelStorageMgr_AppendOnly:
		{
			MirroredAppendOnlyOpen	mirroredOpen;
			int						primaryError;
			bool					mirrorDataLossOccurred;
			char					*buffer = NULL;
			int64					endOffset = entry->mirrorAppendOnlyNewEof;
			int64					startOffset = entry->mirrorAppendOnlyLossEof;
			int32					bufferLen = 0;
			int						retval = 0;
			
			switch (entry->mirrorDataSynchronizationState)
			{
				case MirroredRelDataSynchronizationState_AppendOnlyCatchup:
				case MirroredRelDataSynchronizationState_FullCopy:
					
					/* 
					 * required in order to report how many blocks were synchronized 
					 * if gp_persistent_relation_node does not return that information 
					 */
					if (entry->mirrorBufpoolResyncChangedPageCount == 0)
					{
						entry->mirrorBufpoolResyncChangedPageCount = (endOffset - startOffset) / BLCKSZ;
					}					
					
					/*
					 * The MirroredAppendOnly_OpenResynchonize routine knows we are a resynch worker and
					 * will open BOTH, but write only the MIRROR!!!
					 */
					MirroredAppendOnly_OpenResynchonize(
											&mirroredOpen, 
											&entry->relFileNode,
											entry->segmentFileNum,
											startOffset,
											&primaryError,
											&mirrorDataLossOccurred);
					if (primaryError != 0)
					{
						ereport(ERROR,
								(errcode_for_file_access(),
								 errmsg("could not open file %u/%u/%u.%u : %s",
										entry->relFileNode.spcNode,
										entry->relFileNode.dbNode,
										entry->relFileNode.relNode,
										entry->segmentFileNum,
										strerror(primaryError))));
						
						break;
					}

					if (mirrorDataLossOccurred)
						break;
					
					/* AO and CO Data Store writes 64k size by default */
					bufferLen = (Size) Min(2*BLCKSZ, endOffset - startOffset);
					buffer = (char*) palloc(bufferLen);
					if (buffer == NULL)
						ereport(ERROR,
								(errcode(ERRCODE_OUT_OF_MEMORY),
								 (errmsg("not enough memory for resynchronization"))));
					
					MemSet(buffer, 0, bufferLen);
					
					while (startOffset < endOffset)
					{
						retval = MirroredAppendOnly_Read(
												&mirroredOpen,
												buffer,
												bufferLen);
						
						if (retval != bufferLen) 
						{
							ereport(ERROR,
									(errcode_for_file_access(),
									 errmsg("could not read from position:" INT64_FORMAT " in file %u/%u/%u.%u : %m",
											startOffset, 
											entry->relFileNode.spcNode,
											entry->relFileNode.dbNode,
											entry->relFileNode.relNode,
											entry->segmentFileNum)));
							
							break;
						}						
						
						MirroredAppendOnly_Append(
											  &mirroredOpen,
											  buffer,
											  bufferLen,
											  &primaryError,
											  &mirrorDataLossOccurred);
						
						if (mirrorDataLossOccurred)
							break;

						Assert(primaryError == 0);	// No primary writes as resync worker.
						
						startOffset += bufferLen;
						/* AO and CO Data Store writes 64k size by default */
						bufferLen = (Size) Min(2*BLCKSZ, endOffset - startOffset);						
					}
					
					if (buffer) 
					{
						pfree(buffer);
						buffer = NULL;
					}
					
					if (mirrorDataLossOccurred)
						break;
					
					/* Flush written data on Mirror */
					MirroredAppendOnly_Flush(
										&mirroredOpen,
										&primaryError,
										&mirrorDataLossOccurred);
					if (mirrorDataLossOccurred)
						break;
					
					Assert(primaryError == 0);	// Not flushed on primary as resync worker.
					
					/* Close Primary and Mirror */
					MirroredAppendOnly_Close(
										&mirroredOpen,
										&mirrorDataLossOccurred);
								
					break;
					
				case MirroredRelDataSynchronizationState_None:										
				case MirroredRelDataSynchronizationState_DataSynchronized:
					break;					
					
				default:
					ereport(LOG, 
							(errmsg("could not resynchronize relation '%u/%u/%u' "
									"mirror synchronization state:'%s(%d)' ",
									entry->relFileNode.spcNode,
									entry->relFileNode.dbNode,
									entry->relFileNode.relNode,
									MirroredRelDataSynchronizationState_Name(entry->mirrorDataSynchronizationState),
									entry->mirrorDataSynchronizationState)));
					break;
			}
			
			break;
		}	//case
		default:
			Assert(0);
			break;
	} //switch
	
	if (mirrorDataLossOccurred)
		status = STATUS_ERROR;
	
	return status;
}
Code Example #6
File: twophase.c Project: Khalefa/VLDB12Demo
/*
 * CheckPointTwoPhase -- handle 2PC component of checkpointing.
 *
 * We must fsync the state file of any GXACT that is valid and has a PREPARE
 * LSN <= the checkpoint's redo horizon.  (If the gxact isn't valid yet or
 * has a later LSN, this checkpoint is not responsible for fsyncing it.)
 *
 * This is deliberately run as late as possible in the checkpoint sequence,
 * because GXACTs ordinarily have short lifespans, and so it is quite
 * possible that GXACTs that were valid at checkpoint start will no longer
 * exist if we wait a little bit.
 *
 * If a GXACT remains valid across multiple checkpoints, it'll be fsynced
 * each time.  This is considered unusual enough that we don't bother to
 * expend any extra code to avoid the redundant fsyncs.  (They should be
 * reasonably cheap anyway, since they won't cause I/O.)
 */
void
CheckPointTwoPhase(XLogRecPtr redo_horizon)
{
	TransactionId *xids;
	int			nxids;
	char		path[MAXPGPATH];
	int			i;

	/*
	 * We don't want to hold the TwoPhaseStateLock while doing I/O, so we grab
	 * it just long enough to make a list of the XIDs that require fsyncing,
	 * and then do the I/O afterwards.
	 *
	 * This approach creates a race condition: someone else could delete a
	 * GXACT between the time we release TwoPhaseStateLock and the time we try
	 * to open its state file.	We handle this by special-casing ENOENT
	 * failures: if we see that, we verify that the GXACT is no longer valid,
	 * and if so ignore the failure.
	 */
	if (max_prepared_xacts <= 0)
		return;					/* nothing to do */

	TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_START();

	xids = (TransactionId *) palloc(max_prepared_xacts * sizeof(TransactionId));
	nxids = 0;

	LWLockAcquire(TwoPhaseStateLock, LW_SHARED);

	for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
	{
		GlobalTransaction gxact = TwoPhaseState->prepXacts[i];

		if (gxact->valid &&
			XLByteLE(gxact->prepare_lsn, redo_horizon))
			xids[nxids++] = gxact->proc.xid;
	}

	LWLockRelease(TwoPhaseStateLock);

	for (i = 0; i < nxids; i++)
	{
		TransactionId xid = xids[i];
		int			fd;

		TwoPhaseFilePath(path, xid);

		fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);
		if (fd < 0)
		{
			if (errno == ENOENT)
			{
				/* OK if gxact is no longer valid */
				if (!TransactionIdIsPrepared(xid))
					continue;
				/* Restore errno in case it was changed */
				errno = ENOENT;
			}
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not open two-phase state file \"%s\": %m",
							path)));
		}

		if (pg_fsync(fd) != 0)
		{
			close(fd);
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not fsync two-phase state file \"%s\": %m",
							path)));
		}

		if (close(fd) != 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not close two-phase state file \"%s\": %m",
							path)));
	}

	pfree(xids);

	TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_DONE();
}
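
For context, the state file path built by TwoPhaseFilePath above is derived from the XID alone; a paraphrase of the macro as defined in twophase.c of this era (not authoritative):

/* Paraphrased from twophase.c: 2PC state files live under pg_twophase/. */
#define TWOPHASE_DIR "pg_twophase"

/* pathname of a two-phase state file, named by the transaction's XID */
#define TwoPhaseFilePath(path, xid) \
	snprintf(path, MAXPGPATH, TWOPHASE_DIR "/%08X", xid)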
Code Example #7
File: syncrep.c Project: hl0103/pgxc
/*
 * Wait for synchronous replication, if requested by user.
 *
 * Initially backends start in state SYNC_REP_NOT_WAITING and then
 * change that state to SYNC_REP_WAITING before adding themselves
 * to the wait queue. During SyncRepWakeQueue() a WALSender changes
 * the state to SYNC_REP_WAIT_COMPLETE once replication is confirmed.
 * The backend then resets its state to SYNC_REP_NOT_WAITING.
 */
void
SyncRepWaitForLSN(XLogRecPtr XactCommitLSN)
{
	char	   *new_status = NULL;
	const char *old_status;

	/*
	 * Fast exit if user has not requested sync replication, or there are no
	 * sync replication standby names defined. Note that those standbys don't
	 * need to be connected.
	 */
	if (!SyncRepRequested() || !SyncStandbysDefined())
		return;

	Assert(SHMQueueIsDetached(&(MyProc->syncRepLinks)));
	Assert(WalSndCtl != NULL);

	/* Reset the latch before adding ourselves to the queue. */
	ResetLatch(&MyProc->waitLatch);

	LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
	Assert(MyProc->syncRepState == SYNC_REP_NOT_WAITING);

	/*
	 * We don't wait for sync rep if WalSndCtl->sync_standbys_defined is not
	 * set.  See SyncRepUpdateSyncStandbysDefined.
	 *
	 * Also check that the standby hasn't already replied. Unlikely race
	 * condition, but we'll be fetching that cache line anyway so it's likely
	 * to be a low-cost check.
	 */
	if (!WalSndCtl->sync_standbys_defined ||
		XLByteLE(XactCommitLSN, WalSndCtl->lsn))
	{
		LWLockRelease(SyncRepLock);
		return;
	}

	/*
	 * Set our waitLSN so WALSender will know when to wake us, and add
	 * ourselves to the queue.
	 */
	MyProc->waitLSN = XactCommitLSN;
	MyProc->syncRepState = SYNC_REP_WAITING;
	SyncRepQueueInsert();
	Assert(SyncRepQueueIsOrderedByLSN());
	LWLockRelease(SyncRepLock);

	/* Alter ps display to show waiting for sync rep. */
	if (update_process_title)
	{
		int			len;

		old_status = get_ps_display(&len);
		new_status = (char *) palloc(len + 32 + 1);
		memcpy(new_status, old_status, len);
		sprintf(new_status + len, " waiting for %X/%X",
				XactCommitLSN.xlogid, XactCommitLSN.xrecoff);
		set_ps_display(new_status, false);
		new_status[len] = '\0'; /* truncate off " waiting ..." */
	}

	/*
	 * Wait for specified LSN to be confirmed.
	 *
	 * Each proc has its own wait latch, so we perform a normal latch
	 * check/wait loop here.
	 */
	for (;;)
	{
		int			syncRepState;

		/*
		 * Wait on latch for up to 60 seconds. This allows us to check for
		 * postmaster death regularly while waiting. Note that a timeout here
		 * does not necessarily break us out of the loop.
		 */
		WaitLatch(&MyProc->waitLatch, 60000000L);

		/* Must reset the latch before testing state. */
		ResetLatch(&MyProc->waitLatch);

		/*
		 * Try checking the state without the lock first.  There's no
		 * guarantee that we'll read the most up-to-date value, so if it looks
		 * like we're still waiting, recheck while holding the lock.  But if
		 * it looks like we're done, we must really be done, because once
		 * walsender changes the state to SYNC_REP_WAIT_COMPLETE, it will
		 * never update it again, so we can't be seeing a stale value in that
		 * case.
		 */
		syncRepState = MyProc->syncRepState;
		if (syncRepState == SYNC_REP_WAITING)
		{
			LWLockAcquire(SyncRepLock, LW_SHARED);
			syncRepState = MyProc->syncRepState;
			LWLockRelease(SyncRepLock);
		}
		if (syncRepState == SYNC_REP_WAIT_COMPLETE)
			break;

		/*
		 * If a wait for synchronous replication is pending, we can neither
		 * acknowledge the commit nor raise ERROR or FATAL.  The latter would
		 * lead the client to believe that the transaction aborted, which
		 * is not true: it's already committed locally. The former is no good
		 * either: the client has requested synchronous replication, and is
		 * entitled to assume that an acknowledged commit is also replicated,
		 * which may not be true. So in this case we issue a WARNING (which
		 * some clients may be able to interpret) and shut off further output.
		 * We do NOT reset ProcDiePending, so that the process will die after
		 * the commit is cleaned up.
		 */
		if (ProcDiePending)
		{
			ereport(WARNING,
					(errcode(ERRCODE_ADMIN_SHUTDOWN),
					 errmsg("canceling the wait for synchronous replication and terminating connection due to administrator command"),
					 errdetail("The transaction has already committed locally, but may not have been replicated to the standby.")));
			whereToSendOutput = DestNone;
			SyncRepCancelWait();
			break;
		}

		/*
		 * It's unclear what to do if a query cancel interrupt arrives.  We
		 * can't actually abort at this point, but ignoring the interrupt
		 * altogether is not helpful, so we just terminate the wait with a
		 * suitable warning.
		 */
		if (QueryCancelPending)
		{
			QueryCancelPending = false;
			ereport(WARNING,
					(errmsg("canceling wait for synchronous replication due to user request"),
					 errdetail("The transaction has already committed locally, but may not have been replicated to the standby.")));
			SyncRepCancelWait();
			break;
		}

		/*
		 * If the postmaster dies, we'll probably never get an
		 * acknowledgement, because all the wal sender processes will exit. So
		 * just bail out.
		 */
		if (!PostmasterIsAlive(true))
		{
			ProcDiePending = true;
			whereToSendOutput = DestNone;
			SyncRepCancelWait();
			break;
		}
	}

	/*
	 * WalSender has checked our LSN and has removed us from queue. Clean up
	 * state and leave.  It's OK to reset these shared memory fields without
	 * holding SyncRepLock, because any walsenders will ignore us anyway when
	 * we're not on the queue.
	 */
	Assert(SHMQueueIsDetached(&(MyProc->syncRepLinks)));
	MyProc->syncRepState = SYNC_REP_NOT_WAITING;
	MyProc->waitLSN.xlogid = 0;
	MyProc->waitLSN.xrecoff = 0;

	if (new_status)
	{
		/* Reset ps display */
		set_ps_display(new_status, false);
		pfree(new_status);
	}
}
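
The three wait states named in the header comment are plain integer constants; a sketch consistent with the syncrep.h of this era (paraphrased, not authoritative):

/* Paraphrased syncRepState values referenced above. */
#define SYNC_REP_NOT_WAITING		0	/* backend is not in the wait queue */
#define SYNC_REP_WAITING			1	/* queued, waiting for a walsender */
#define SYNC_REP_WAIT_COMPLETE		2	/* walsender confirmed replication */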
Code Example #8
File: gistxlog.c Project: GisKook/Gis
/*
 * redo any page update (except page split)
 */
static void
gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record)
{
	char	   *begin = XLogRecGetData(record);
	gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) begin;
	Buffer		buffer;
	Page		page;
	char	   *data;

	if (BlockNumberIsValid(xldata->leftchild))
		gistRedoClearFollowRight(xldata->node, lsn, xldata->leftchild);

	/* nothing more to do if page was backed up (and no info to do it with) */
	if (record->xl_info & XLR_BKP_BLOCK_1)
		return;

	buffer = XLogReadBuffer(xldata->node, xldata->blkno, false);
	if (!BufferIsValid(buffer))
		return;
	page = (Page) BufferGetPage(buffer);

	if (XLByteLE(lsn, PageGetLSN(page)))
	{
		UnlockReleaseBuffer(buffer);
		return;
	}

	data = begin + sizeof(gistxlogPageUpdate);

	/* Delete old tuples */
	if (xldata->ntodelete > 0)
	{
		int			i;
		OffsetNumber *todelete = (OffsetNumber *) data;

		data += sizeof(OffsetNumber) * xldata->ntodelete;

		for (i = 0; i < xldata->ntodelete; i++)
			PageIndexTupleDelete(page, todelete[i]);
		if (GistPageIsLeaf(page))
			GistMarkTuplesDeleted(page);
	}

	/* add tuples */
	if (data - begin < record->xl_len)
	{
		OffsetNumber off = (PageIsEmpty(page)) ? FirstOffsetNumber :
		OffsetNumberNext(PageGetMaxOffsetNumber(page));

		while (data - begin < record->xl_len)
		{
			IndexTuple	itup = (IndexTuple) data;
			Size		sz = IndexTupleSize(itup);
			OffsetNumber l;

			data += sz;

			l = PageAddItem(page, (Item) itup, sz, off, false, false);
			if (l == InvalidOffsetNumber)
				elog(ERROR, "failed to add item to GiST index page, size %d bytes",
					 (int) sz);
			off++;
		}
	}
	else
	{
		/*
		 * special case: leafpage, nothing to insert, nothing to delete, then
		 * vacuum marks page
		 */
		if (GistPageIsLeaf(page) && xldata->ntodelete == 0)
			GistClearTuplesDeleted(page);
	}

	if (!GistPageIsLeaf(page) && PageGetMaxOffsetNumber(page) == InvalidOffsetNumber && xldata->blkno == GIST_ROOT_BLKNO)

		/*
		 * all links on the non-leaf root page were deleted by vacuum full, so
		 * the root page becomes a leaf
		 */
		GistPageSetLeaf(page);

	GistPageGetOpaque(page)->rightlink = InvalidBlockNumber;
	PageSetLSN(page, lsn);
	PageSetTLI(page, ThisTimeLineID);
	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);
}
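
Every redo routine on this page repeats the same idempotent-replay guard: apply the logged change only if the page's LSN is still older than the record's LSN, then stamp the page with the record LSN so a second replay becomes a no-op. Distilled into a skeleton (the routine name is a placeholder; the calls are the real buffer/page API of this era):

/* Skeleton of the redo-guard pattern used throughout these examples. */
static void
RedoGuardSkeleton(XLogRecPtr lsn, Buffer buffer)
{
	Page		page = BufferGetPage(buffer);

	if (!XLByteLE(lsn, PageGetLSN(page)))		/* page LSN < record LSN? */
	{
		/* ... apply the logged change to the page here ... */

		PageSetLSN(page, lsn);
		PageSetTLI(page, ThisTimeLineID);
		MarkBufferDirty(buffer);
	}
	UnlockReleaseBuffer(buffer);
}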
Code Example #9
File: spgxlog.c Project: avontd2868/postgres
static void
spgRedoAddLeaf(XLogRecPtr lsn, XLogRecord *record)
{
	char	   *ptr = XLogRecGetData(record);
	spgxlogAddLeaf *xldata = (spgxlogAddLeaf *) ptr;
	SpGistLeafTuple leafTuple;
	Buffer		buffer;
	Page		page;

	/* we assume this is adequately aligned */
	ptr += sizeof(spgxlogAddLeaf);
	leafTuple = (SpGistLeafTuple) ptr;

	if (!(record->xl_info & XLR_BKP_BLOCK_1))
	{
		buffer = XLogReadBuffer(xldata->node, xldata->blknoLeaf,
								xldata->newPage);
		if (BufferIsValid(buffer))
		{
			page = BufferGetPage(buffer);

			if (xldata->newPage)
				SpGistInitBuffer(buffer, SPGIST_LEAF);

			if (!XLByteLE(lsn, PageGetLSN(page)))
			{
				/* insert new tuple */
				if (xldata->offnumLeaf != xldata->offnumHeadLeaf)
				{
					/* normal cases, tuple was added by SpGistPageAddNewItem */
					addOrReplaceTuple(page, (Item) leafTuple, leafTuple->size,
									  xldata->offnumLeaf);

					/* update head tuple's chain link if needed */
					if (xldata->offnumHeadLeaf != InvalidOffsetNumber)
					{
						SpGistLeafTuple head;

						head = (SpGistLeafTuple) PageGetItem(page,
														 PageGetItemId(page, xldata->offnumHeadLeaf));
						Assert(head->nextOffset == leafTuple->nextOffset);
						head->nextOffset = xldata->offnumLeaf;
					}
				}
				else
				{
					/* replacing a DEAD tuple */
					PageIndexTupleDelete(page, xldata->offnumLeaf);
					if (PageAddItem(page,
									(Item) leafTuple, leafTuple->size,
									xldata->offnumLeaf, false, false) != xldata->offnumLeaf)
						elog(ERROR, "failed to add item of size %u to SPGiST index page",
							 leafTuple->size);
				}

				PageSetLSN(page, lsn);
				PageSetTLI(page, ThisTimeLineID);
				MarkBufferDirty(buffer);
			}
			UnlockReleaseBuffer(buffer);
		}
	}

	/* update parent downlink if necessary */
	if (xldata->blknoParent != InvalidBlockNumber &&
		!(record->xl_info & XLR_BKP_BLOCK_2))
	{
		buffer = XLogReadBuffer(xldata->node, xldata->blknoParent, false);
		if (BufferIsValid(buffer))
		{
			page = BufferGetPage(buffer);
			if (!XLByteLE(lsn, PageGetLSN(page)))
			{
				SpGistInnerTuple tuple;

				tuple = (SpGistInnerTuple) PageGetItem(page,
													   PageGetItemId(page, xldata->offnumParent));

				spgUpdateNodeLink(tuple, xldata->nodeI,
								  xldata->blknoLeaf, xldata->offnumLeaf);

				PageSetLSN(page, lsn);
				PageSetTLI(page, ThisTimeLineID);
				MarkBufferDirty(buffer);
			}
			UnlockReleaseBuffer(buffer);
		}
	}
}
Code Example #10
File: spgxlog.c Project: avontd2868/postgres
static void
spgRedoVacuumRedirect(XLogRecPtr lsn, XLogRecord *record)
{
	char	   *ptr = XLogRecGetData(record);
	spgxlogVacuumRedirect *xldata = (spgxlogVacuumRedirect *) ptr;
	OffsetNumber *itemToPlaceholder;
	Buffer		buffer;
	Page		page;

	ptr += sizeof(spgxlogVacuumRedirect);
	itemToPlaceholder = (OffsetNumber *) ptr;

	if (!(record->xl_info & XLR_BKP_BLOCK_1))
	{
		buffer = XLogReadBuffer(xldata->node, xldata->blkno, false);

		if (BufferIsValid(buffer))
		{
			page = BufferGetPage(buffer);
			if (!XLByteLE(lsn, PageGetLSN(page)))
			{
				SpGistPageOpaque opaque = SpGistPageGetOpaque(page);
				int			i;

				/* Convert redirect pointers to plain placeholders */
				for (i = 0; i < xldata->nToPlaceholder; i++)
				{
					SpGistDeadTuple dt;

					dt = (SpGistDeadTuple) PageGetItem(page,
													   PageGetItemId(page, itemToPlaceholder[i]));
					Assert(dt->tupstate == SPGIST_REDIRECT);
					dt->tupstate = SPGIST_PLACEHOLDER;
					ItemPointerSetInvalid(&dt->pointer);
				}

				Assert(opaque->nRedirection >= xldata->nToPlaceholder);
				opaque->nRedirection -= xldata->nToPlaceholder;
				opaque->nPlaceholder += xldata->nToPlaceholder;

				/* Remove placeholder tuples at end of page */
				if (xldata->firstPlaceholder != InvalidOffsetNumber)
				{
					int			max = PageGetMaxOffsetNumber(page);
					OffsetNumber *toDelete;

					toDelete = palloc(sizeof(OffsetNumber) * max);

					for (i = xldata->firstPlaceholder; i <= max; i++)
						toDelete[i - xldata->firstPlaceholder] = i;

					i = max - xldata->firstPlaceholder + 1;
					Assert(opaque->nPlaceholder >= i);
					opaque->nPlaceholder -= i;

					/* The array is sorted, so can use PageIndexMultiDelete */
					PageIndexMultiDelete(page, toDelete, i);

					pfree(toDelete);
				}

				PageSetLSN(page, lsn);
				PageSetTLI(page, ThisTimeLineID);
				MarkBufferDirty(buffer);
			}

			UnlockReleaseBuffer(buffer);
		}
	}
}
Code Example #11
File: spgxlog.c Project: avontd2868/postgres
static void
spgRedoVacuumLeaf(XLogRecPtr lsn, XLogRecord *record)
{
	char	   *ptr = XLogRecGetData(record);
	spgxlogVacuumLeaf *xldata = (spgxlogVacuumLeaf *) ptr;
	OffsetNumber *toDead;
	OffsetNumber *toPlaceholder;
	OffsetNumber *moveSrc;
	OffsetNumber *moveDest;
	OffsetNumber *chainSrc;
	OffsetNumber *chainDest;
	SpGistState state;
	Buffer		buffer;
	Page		page;
	int			i;

	fillFakeState(&state, xldata->stateSrc);

	ptr += sizeof(spgxlogVacuumLeaf);
	toDead = (OffsetNumber *) ptr;
	ptr += sizeof(OffsetNumber) * xldata->nDead;
	toPlaceholder = (OffsetNumber *) ptr;
	ptr += sizeof(OffsetNumber) * xldata->nPlaceholder;
	moveSrc = (OffsetNumber *) ptr;
	ptr += sizeof(OffsetNumber) * xldata->nMove;
	moveDest = (OffsetNumber *) ptr;
	ptr += sizeof(OffsetNumber) * xldata->nMove;
	chainSrc = (OffsetNumber *) ptr;
	ptr += sizeof(OffsetNumber) * xldata->nChain;
	chainDest = (OffsetNumber *) ptr;

	if (!(record->xl_info & XLR_BKP_BLOCK_1))
	{
		buffer = XLogReadBuffer(xldata->node, xldata->blkno, false);
		if (BufferIsValid(buffer))
		{
			page = BufferGetPage(buffer);
			if (!XLByteLE(lsn, PageGetLSN(page)))
			{
				spgPageIndexMultiDelete(&state, page,
										toDead, xldata->nDead,
										SPGIST_DEAD, SPGIST_DEAD,
										InvalidBlockNumber,
										InvalidOffsetNumber);

				spgPageIndexMultiDelete(&state, page,
										toPlaceholder, xldata->nPlaceholder,
										SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER,
										InvalidBlockNumber,
										InvalidOffsetNumber);

				/* see comments in vacuumLeafPage() */
				for (i = 0; i < xldata->nMove; i++)
				{
					ItemId		idSrc = PageGetItemId(page, moveSrc[i]);
					ItemId		idDest = PageGetItemId(page, moveDest[i]);
					ItemIdData	tmp;

					tmp = *idSrc;
					*idSrc = *idDest;
					*idDest = tmp;
				}

				spgPageIndexMultiDelete(&state, page,
										moveSrc, xldata->nMove,
										SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER,
										InvalidBlockNumber,
										InvalidOffsetNumber);

				for (i = 0; i < xldata->nChain; i++)
				{
					SpGistLeafTuple lt;

					lt = (SpGistLeafTuple) PageGetItem(page,
										   PageGetItemId(page, chainSrc[i]));
					Assert(lt->tupstate == SPGIST_LIVE);
					lt->nextOffset = chainDest[i];
				}

				PageSetLSN(page, lsn);
				PageSetTLI(page, ThisTimeLineID);
				MarkBufferDirty(buffer);
			}
			UnlockReleaseBuffer(buffer);
		}
	}
}
Code Example #12
File: spgxlog.c Project: avontd2868/postgres
static void
spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record)
{
	char	   *ptr = XLogRecGetData(record);
	spgxlogPickSplit *xldata = (spgxlogPickSplit *) ptr;
	SpGistInnerTuple innerTuple;
	SpGistState state;
	OffsetNumber *toDelete;
	OffsetNumber *toInsert;
	uint8	   *leafPageSelect;
	Buffer		srcBuffer;
	Buffer		destBuffer;
	Page		page;
	int			bbi;
	int			i;

	fillFakeState(&state, xldata->stateSrc);

	ptr += MAXALIGN(sizeof(spgxlogPickSplit));
	innerTuple = (SpGistInnerTuple) ptr;
	ptr += innerTuple->size;
	toDelete = (OffsetNumber *) ptr;
	ptr += MAXALIGN(sizeof(OffsetNumber) * xldata->nDelete);
	toInsert = (OffsetNumber *) ptr;
	ptr += MAXALIGN(sizeof(OffsetNumber) * xldata->nInsert);
	leafPageSelect = (uint8 *) ptr;
	ptr += MAXALIGN(sizeof(uint8) * xldata->nInsert);

	/* now ptr points to the list of leaf tuples */

	/*
	 * It's a bit tricky to identify which pages have been handled as
	 * full-page images, so we explicitly count each referenced buffer.
	 */
	bbi = 0;

	if (xldata->blknoSrc == SPGIST_HEAD_BLKNO)
	{
		/* when splitting root, we touch it only in the guise of new inner */
		srcBuffer = InvalidBuffer;
	}
	else if (xldata->initSrc)
	{
		/* just re-init the source page */
		srcBuffer = XLogReadBuffer(xldata->node, xldata->blknoSrc, true);
		Assert(BufferIsValid(srcBuffer));
		page = (Page) BufferGetPage(srcBuffer);

		SpGistInitBuffer(srcBuffer, SPGIST_LEAF);
		/* don't update LSN etc till we're done with it */
	}
	else
	{
		/* delete the specified tuples from source page */
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi)))
		{
			srcBuffer = XLogReadBuffer(xldata->node, xldata->blknoSrc, false);
			if (BufferIsValid(srcBuffer))
			{
				page = BufferGetPage(srcBuffer);
				if (!XLByteLE(lsn, PageGetLSN(page)))
				{
					/*
					 * We have it a bit easier here than in doPickSplit(),
					 * because we know the inner tuple's location already,
					 * so we can inject the correct redirection tuple now.
					 */
					if (!state.isBuild)
						spgPageIndexMultiDelete(&state, page,
												toDelete, xldata->nDelete,
												SPGIST_REDIRECT,
												SPGIST_PLACEHOLDER,
												xldata->blknoInner,
												xldata->offnumInner);
					else
						spgPageIndexMultiDelete(&state, page,
												toDelete, xldata->nDelete,
												SPGIST_PLACEHOLDER,
												SPGIST_PLACEHOLDER,
												InvalidBlockNumber,
												InvalidOffsetNumber);

					/* don't update LSN etc till we're done with it */
				}
			}
		}
		else
			srcBuffer = InvalidBuffer;
		bbi++;
	}

	/* try to access dest page if any */
	if (xldata->blknoDest == InvalidBlockNumber)
	{
		destBuffer = InvalidBuffer;
	}
	else if (xldata->initDest)
	{
		/* just re-init the dest page */
		destBuffer = XLogReadBuffer(xldata->node, xldata->blknoDest, true);
		Assert(BufferIsValid(destBuffer));
		page = (Page) BufferGetPage(destBuffer);

		SpGistInitBuffer(destBuffer, SPGIST_LEAF);
		/* don't update LSN etc till we're done with it */
	}
	else
	{
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi)))
			destBuffer = XLogReadBuffer(xldata->node, xldata->blknoDest, false);
		else
			destBuffer = InvalidBuffer;
		bbi++;
	}

	/* restore leaf tuples to src and/or dest page */
	for (i = 0; i < xldata->nInsert; i++)
	{
		SpGistLeafTuple lt = (SpGistLeafTuple) ptr;
		Buffer		leafBuffer;

		ptr += lt->size;

		leafBuffer = leafPageSelect[i] ? destBuffer : srcBuffer;
		if (!BufferIsValid(leafBuffer))
			continue;			/* no need to touch this page */
		page = BufferGetPage(leafBuffer);

		if (!XLByteLE(lsn, PageGetLSN(page)))
		{
			addOrReplaceTuple(page, (Item) lt, lt->size, toInsert[i]);
		}
	}

	/* Now update src and dest page LSNs */
	if (BufferIsValid(srcBuffer))
	{
		page = BufferGetPage(srcBuffer);
		if (!XLByteLE(lsn, PageGetLSN(page)))
		{
			PageSetLSN(page, lsn);
			PageSetTLI(page, ThisTimeLineID);
			MarkBufferDirty(srcBuffer);
		}
		UnlockReleaseBuffer(srcBuffer);
	}
	if (BufferIsValid(destBuffer))
	{
		page = BufferGetPage(destBuffer);
		if (!XLByteLE(lsn, PageGetLSN(page)))
		{
			PageSetLSN(page, lsn);
			PageSetTLI(page, ThisTimeLineID);
			MarkBufferDirty(destBuffer);
		}
		UnlockReleaseBuffer(destBuffer);
	}

	/* restore new inner tuple */
	if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi)))
	{
		Buffer		buffer = XLogReadBuffer(xldata->node, xldata->blknoInner,
											xldata->initInner);

		if (BufferIsValid(buffer))
		{
			page = BufferGetPage(buffer);

			if (xldata->initInner)
				SpGistInitBuffer(buffer, 0);

			if (!XLByteLE(lsn, PageGetLSN(page)))
			{
				addOrReplaceTuple(page, (Item) innerTuple, innerTuple->size,
								  xldata->offnumInner);

				/* if inner is also parent, update link while we're here */
				if (xldata->blknoInner == xldata->blknoParent)
				{
					SpGistInnerTuple parent;

					parent = (SpGistInnerTuple) PageGetItem(page,
									PageGetItemId(page, xldata->offnumParent));
					spgUpdateNodeLink(parent, xldata->nodeI,
									  xldata->blknoInner, xldata->offnumInner);
				}

				PageSetLSN(page, lsn);
				PageSetTLI(page, ThisTimeLineID);
				MarkBufferDirty(buffer);
			}
			UnlockReleaseBuffer(buffer);
		}
	}
	bbi++;

	/* update parent downlink, unless we did it above */
	if (xldata->blknoParent == InvalidBlockNumber)
	{
		/* no parent because we split the root */
		Assert(xldata->blknoInner == SPGIST_HEAD_BLKNO);
	}
	else if (xldata->blknoInner != xldata->blknoParent)
	{
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi)))
		{
			Buffer		buffer = XLogReadBuffer(xldata->node, xldata->blknoParent, false);

			if (BufferIsValid(buffer))
			{
				page = BufferGetPage(buffer);

				if (!XLByteLE(lsn, PageGetLSN(page)))
				{
					SpGistInnerTuple parent;

					parent = (SpGistInnerTuple) PageGetItem(page,
									PageGetItemId(page, xldata->offnumParent));
					spgUpdateNodeLink(parent, xldata->nodeI,
									  xldata->blknoInner, xldata->offnumInner);

					PageSetLSN(page, lsn);
					PageSetTLI(page, ThisTimeLineID);
					MarkBufferDirty(buffer);
				}
				UnlockReleaseBuffer(buffer);
			}
		}
	}
}
Code Example #13
File: spgxlog.c Project: avontd2868/postgres
static void
spgRedoSplitTuple(XLogRecPtr lsn, XLogRecord *record)
{
	char	   *ptr = XLogRecGetData(record);
	spgxlogSplitTuple *xldata = (spgxlogSplitTuple *) ptr;
	SpGistInnerTuple prefixTuple;
	SpGistInnerTuple postfixTuple;
	Buffer		buffer;
	Page		page;

	/* we assume this is adequately aligned */
	ptr += sizeof(spgxlogSplitTuple);
	prefixTuple = (SpGistInnerTuple) ptr;
	ptr += prefixTuple->size;
	postfixTuple = (SpGistInnerTuple) ptr;

	/* insert postfix tuple first to avoid dangling link */
	if (xldata->blknoPostfix != xldata->blknoPrefix &&
		!(record->xl_info & XLR_BKP_BLOCK_2))
	{
		buffer = XLogReadBuffer(xldata->node, xldata->blknoPostfix,
								xldata->newPage);
		if (BufferIsValid(buffer))
		{
			page = BufferGetPage(buffer);

			if (xldata->newPage)
				SpGistInitBuffer(buffer, 0);

			if (!XLByteLE(lsn, PageGetLSN(page)))
			{
				addOrReplaceTuple(page, (Item) postfixTuple,
								  postfixTuple->size, xldata->offnumPostfix);

				PageSetLSN(page, lsn);
				PageSetTLI(page, ThisTimeLineID);
				MarkBufferDirty(buffer);
			}
			UnlockReleaseBuffer(buffer);
		}
	}

	/* now handle the original page */
	if (!(record->xl_info & XLR_BKP_BLOCK_1))
	{
		buffer = XLogReadBuffer(xldata->node, xldata->blknoPrefix, false);
		if (BufferIsValid(buffer))
		{
			page = BufferGetPage(buffer);
			if (!XLByteLE(lsn, PageGetLSN(page)))
			{
				PageIndexTupleDelete(page, xldata->offnumPrefix);
				if (PageAddItem(page, (Item) prefixTuple, prefixTuple->size,
								xldata->offnumPrefix, false, false) != xldata->offnumPrefix)
					elog(ERROR, "failed to add item of size %u to SPGiST index page",
						 prefixTuple->size);

				if (xldata->blknoPostfix == xldata->blknoPrefix)
					addOrReplaceTuple(page, (Item) postfixTuple,
									  postfixTuple->size,
									  xldata->offnumPostfix);

				PageSetLSN(page, lsn);
				PageSetTLI(page, ThisTimeLineID);
				MarkBufferDirty(buffer);
			}
			UnlockReleaseBuffer(buffer);
		}
	}
}
Code Example #14
File: spgxlog.c Project: avontd2868/postgres
static void
spgRedoAddNode(XLogRecPtr lsn, XLogRecord *record)
{
	char	   *ptr = XLogRecGetData(record);
	spgxlogAddNode *xldata = (spgxlogAddNode *) ptr;
	SpGistInnerTuple innerTuple;
	SpGistState state;
	Buffer		buffer;
	Page		page;
	int			bbi;

	/* we assume this is adequately aligned */
	ptr += sizeof(spgxlogAddNode);
	innerTuple = (SpGistInnerTuple) ptr;

	fillFakeState(&state, xldata->stateSrc);

	if (xldata->blknoNew == InvalidBlockNumber)
	{
		/* update in place */
		Assert(xldata->blknoParent == InvalidBlockNumber);
		if (!(record->xl_info & XLR_BKP_BLOCK_1))
		{
			buffer = XLogReadBuffer(xldata->node, xldata->blkno, false);
			if (BufferIsValid(buffer))
			{
				page = BufferGetPage(buffer);
				if (!XLByteLE(lsn, PageGetLSN(page)))
				{
					PageIndexTupleDelete(page, xldata->offnum);
					if (PageAddItem(page, (Item) innerTuple, innerTuple->size,
									xldata->offnum,
									false, false) != xldata->offnum)
						elog(ERROR, "failed to add item of size %u to SPGiST index page",
							 innerTuple->size);

					PageSetLSN(page, lsn);
					PageSetTLI(page, ThisTimeLineID);
					MarkBufferDirty(buffer);
				}
				UnlockReleaseBuffer(buffer);
			}
		}
	}
	else
	{
		/* Install new tuple first so redirect is valid */
		if (!(record->xl_info & XLR_BKP_BLOCK_2))
		{
			buffer = XLogReadBuffer(xldata->node, xldata->blknoNew,
									xldata->newPage);
			if (BufferIsValid(buffer))
			{
				page = BufferGetPage(buffer);

				if (xldata->newPage)
					SpGistInitBuffer(buffer, 0);

				if (!XLByteLE(lsn, PageGetLSN(page)))
				{
					addOrReplaceTuple(page, (Item) innerTuple,
									  innerTuple->size, xldata->offnumNew);

					PageSetLSN(page, lsn);
					PageSetTLI(page, ThisTimeLineID);
					MarkBufferDirty(buffer);
				}
				UnlockReleaseBuffer(buffer);
			}
		}

		/* Delete old tuple, replacing it with redirect or placeholder tuple */
		if (!(record->xl_info & XLR_BKP_BLOCK_1))
		{
			buffer = XLogReadBuffer(xldata->node, xldata->blkno, false);
			if (BufferIsValid(buffer))
			{
				page = BufferGetPage(buffer);
				if (!XLByteLE(lsn, PageGetLSN(page)))
				{
					SpGistDeadTuple dt;

					if (state.isBuild)
						dt = spgFormDeadTuple(&state, SPGIST_PLACEHOLDER,
											  InvalidBlockNumber,
											  InvalidOffsetNumber);
					else
						dt = spgFormDeadTuple(&state, SPGIST_REDIRECT,
											  xldata->blknoNew,
											  xldata->offnumNew);

					PageIndexTupleDelete(page, xldata->offnum);
					if (PageAddItem(page, (Item) dt, dt->size,
									xldata->offnum,
									false, false) != xldata->offnum)
						elog(ERROR, "failed to add item of size %u to SPGiST index page",
							 dt->size);

					if (state.isBuild)
						SpGistPageGetOpaque(page)->nPlaceholder++;
					else
						SpGistPageGetOpaque(page)->nRedirection++;

					PageSetLSN(page, lsn);
					PageSetTLI(page, ThisTimeLineID);
					MarkBufferDirty(buffer);
				}
				UnlockReleaseBuffer(buffer);
			}
		}

		/*
		 * Update parent downlink.  Since parent could be in either of the
		 * previous two buffers, it's a bit tricky to determine which BKP bit
		 * applies.
		 */
		if (xldata->blknoParent == xldata->blkno)
			bbi = 0;
		else if (xldata->blknoParent == xldata->blknoNew)
			bbi = 1;
		else
			bbi = 2;

		if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi)))
		{
			buffer = XLogReadBuffer(xldata->node, xldata->blknoParent, false);
			if (BufferIsValid(buffer))
			{
				page = BufferGetPage(buffer);
				if (!XLByteLE(lsn, PageGetLSN(page)))
				{
					SpGistInnerTuple innerTuple;

					innerTuple = (SpGistInnerTuple) PageGetItem(page,
																PageGetItemId(page, xldata->offnumParent));

					spgUpdateNodeLink(innerTuple, xldata->nodeI,
									  xldata->blknoNew, xldata->offnumNew);

					PageSetLSN(page, lsn);
					PageSetTLI(page, ThisTimeLineID);
					MarkBufferDirty(buffer);
				}
				UnlockReleaseBuffer(buffer);
			}
		}
	}
}
Code Example #15
File: spgxlog.c Project: avontd2868/postgres
static void
spgRedoMoveLeafs(XLogRecPtr lsn, XLogRecord *record)
{
	char	   *ptr = XLogRecGetData(record);
	spgxlogMoveLeafs *xldata = (spgxlogMoveLeafs *) ptr;
	SpGistState state;
	OffsetNumber *toDelete;
	OffsetNumber *toInsert;
	int			nInsert;
	Buffer		buffer;
	Page		page;

	fillFakeState(&state, xldata->stateSrc);

	nInsert = xldata->replaceDead ? 1 : xldata->nMoves + 1;

	ptr += MAXALIGN(sizeof(spgxlogMoveLeafs));
	toDelete = (OffsetNumber *) ptr;
	ptr += MAXALIGN(sizeof(OffsetNumber) * xldata->nMoves);
	toInsert = (OffsetNumber *) ptr;
	ptr += MAXALIGN(sizeof(OffsetNumber) * nInsert);

	/* now ptr points to the list of leaf tuples */

	/* Insert tuples on the dest page (do first, so redirect is valid) */
	if (!(record->xl_info & XLR_BKP_BLOCK_2))
	{
		buffer = XLogReadBuffer(xldata->node, xldata->blknoDst,
								xldata->newPage);
		if (BufferIsValid(buffer))
		{
			page = BufferGetPage(buffer);

			if (xldata->newPage)
				SpGistInitBuffer(buffer, SPGIST_LEAF);

			if (!XLByteLE(lsn, PageGetLSN(page)))
			{
				int			i;

				for (i = 0; i < nInsert; i++)
				{
					SpGistLeafTuple lt = (SpGistLeafTuple) ptr;

					addOrReplaceTuple(page, (Item) lt, lt->size, toInsert[i]);
					ptr += lt->size;
				}

				PageSetLSN(page, lsn);
				PageSetTLI(page, ThisTimeLineID);
				MarkBufferDirty(buffer);
			}
			UnlockReleaseBuffer(buffer);
		}
	}

	/* Delete tuples from the source page, inserting a redirection pointer */
	if (!(record->xl_info & XLR_BKP_BLOCK_1))
	{
		buffer = XLogReadBuffer(xldata->node, xldata->blknoSrc, false);
		if (BufferIsValid(buffer))
		{
			page = BufferGetPage(buffer);
			if (!XLByteLE(lsn, PageGetLSN(page)))
			{
				spgPageIndexMultiDelete(&state, page, toDelete, xldata->nMoves,
										state.isBuild ? SPGIST_PLACEHOLDER : SPGIST_REDIRECT,
										SPGIST_PLACEHOLDER,
										xldata->blknoDst,
										toInsert[nInsert - 1]);

				PageSetLSN(page, lsn);
				PageSetTLI(page, ThisTimeLineID);
				MarkBufferDirty(buffer);
			}
			UnlockReleaseBuffer(buffer);
		}
	}

	/* And update the parent downlink */
	if (!(record->xl_info & XLR_BKP_BLOCK_3))
	{
		buffer = XLogReadBuffer(xldata->node, xldata->blknoParent, false);
		if (BufferIsValid(buffer))
		{
			page = BufferGetPage(buffer);
			if (!XLByteLE(lsn, PageGetLSN(page)))
			{
				SpGistInnerTuple tuple;

				tuple = (SpGistInnerTuple) PageGetItem(page,
													   PageGetItemId(page, xldata->offnumParent));

				spgUpdateNodeLink(tuple, xldata->nodeI,
								  xldata->blknoDst, toInsert[nInsert - 1]);

				PageSetLSN(page, lsn);
				PageSetTLI(page, ThisTimeLineID);
				MarkBufferDirty(buffer);
			}
			UnlockReleaseBuffer(buffer);
		}
	}
}
Code Example #16
File: repmgrd.c Project: klando/repmgr
static void
do_failover(void)
{
	PGresult *res1;
	PGresult *res2;
	char 	sqlquery[8192];

	int		total_nodes = 0;
	int		visible_nodes = 0;
	bool	find_best = false;

	int		i;
	int		r;

	int 	node;
	char	nodeConninfo[MAXLEN];

	unsigned int uxlogid;
	unsigned int uxrecoff;
	char last_wal_standby_applied[MAXLEN];

	PGconn	*nodeConn = NULL;

	/*
	 * we will get info for up to 50 nodes,
	 * which seems large enough for most scenarios
	 */
	nodeInfo nodes[50];
	nodeInfo best_candidate;

	/* first we get info about this node, and update shared memory */
	sprintf(sqlquery, "SELECT pg_last_xlog_replay_location()");
	res1 = PQexec(myLocalConn, sqlquery);
	if (PQresultStatus(res1) != PGRES_TUPLES_OK)
	{
		log_err(_("PQexec failed: %s.\nReport an invalid value to not be considered as new primary and exit.\n"), PQerrorMessage(myLocalConn));
		PQclear(res1);
		sprintf(last_wal_standby_applied, "'%X/%X'", 0, 0);
		update_shared_memory(last_wal_standby_applied);
		exit(ERR_DB_QUERY);
	}

	/* write last location in shared memory */
	update_shared_memory(PQgetvalue(res1, 0, 0));
	PQclear(res1);

	/*
	 * we sleep for the monitor interval plus one second;
	 * that should be enough time for the other repmgrd instances to update their data
	 */
	sleep(SLEEP_MONITOR + 1);

	/* get a list of standby nodes, including myself */
	sprintf(sqlquery, "SELECT id, conninfo "
	        "  FROM %s.repl_nodes "
	        " WHERE id IN (SELECT standby_node FROM %s.repl_status) "
	        "   AND cluster = '%s' "
	        " ORDER BY priority ",
	        repmgr_schema, repmgr_schema, local_options.cluster_name);

	res1 = PQexec(myLocalConn, sqlquery);
	if (PQresultStatus(res1) != PGRES_TUPLES_OK)
	{
		log_err(_("Can't get nodes info: %s\n"), PQerrorMessage(myLocalConn));
		PQclear(res1);
		PQfinish(myLocalConn);
		exit(ERR_DB_QUERY);
	}

	/* ask for the locations */
	for (i = 0; i < PQntuples(res1); i++)
	{
		node = atoi(PQgetvalue(res1, i, 0));
		/* Initialize to false so that if we can't reach this node we know it later */
		nodes[i].is_ready = false;
		strncpy(nodeConninfo, PQgetvalue(res1, i, 1), MAXLEN);
		nodeConn = establishDBConnection(nodeConninfo, false);
		/* if we can't see the node just skip it */
		if (PQstatus(nodeConn) != CONNECTION_OK)
			continue;

		sqlquery_snprintf(sqlquery, "SELECT repmgr_get_last_standby_location()");
		res2 = PQexec(nodeConn, sqlquery);
		if (PQresultStatus(res2) != PGRES_TUPLES_OK)
		{
			log_info(_("Can't get node's last standby location: %s\n"), PQerrorMessage(nodeConn));
			log_info(_("Connection details: %s\n"), nodeConninfo);
			PQclear(res2);
			PQfinish(nodeConn);
			continue;
		}

		visible_nodes++;

		if (sscanf(PQgetvalue(res2, 0, 0), "%X/%X", &uxlogid, &uxrecoff) != 2)
			log_info(_("could not parse transaction log location \"%s\"\n"), PQgetvalue(res2, 0, 0));

		nodes[i].nodeId = node;
		nodes[i].xlog_location.xlogid = uxlogid;
		nodes[i].xlog_location.xrecoff = uxrecoff;
		nodes[i].is_ready = true;

		PQclear(res2);
		PQfinish(nodeConn);
	}
	PQclear(res1);
	/* Close the connection to this server */
	PQfinish(myLocalConn);

	/*
	 * total nodes that are registered, including the master, which is a node
	 * but was not counted above because it's not a standby
	 */
	total_nodes = i + 1;

	/*
	 * Am I in the group that should stay alive?  If I see fewer than half of
	 * total_nodes then I should do nothing; e.g. with 5 total nodes, at least
	 * 3 must be visible for failover to proceed.
	 */
	if (visible_nodes < (total_nodes / 2.0))
	{
		log_err(_("Can't reach most of the nodes.\n"
		          "Let the other standby servers decide which one will be the primary.\n"
		          "Manual action will be needed to readd this node to the cluster.\n"));
		exit(ERR_FAILOVER_FAIL);
	}

	/*
	 * determine which one is the best candidate to promote to primary
	 */
	for (i = 0; i < total_nodes - 1; i++)
	{
		if (!nodes[i].is_ready)
			continue;
		else if (!find_best)
		{
			/* start with the first ready node, and then move on to the next one */
			best_candidate.nodeId                = nodes[i].nodeId;
			best_candidate.xlog_location.xlogid  = nodes[i].xlog_location.xlogid;
			best_candidate.xlog_location.xrecoff = nodes[i].xlog_location.xrecoff;
			best_candidate.is_ready              = nodes[i].is_ready;
			find_best = true;
		}

		/* we use the macros provided by xlogdefs.h to compare XLogRecPtrs */
		/*
		 * Nodes are retrieved ordered by priority, so if the current best
		 * candidate's WAL location is lower than or equal to the next node's,
		 * assign the next node as the new best candidate.
		 */
		if (XLByteLE(best_candidate.xlog_location, nodes[i].xlog_location))
		{
			best_candidate.nodeId                = nodes[i].nodeId;
			best_candidate.xlog_location.xlogid  = nodes[i].xlog_location.xlogid;
			best_candidate.xlog_location.xrecoff = nodes[i].xlog_location.xrecoff;
			best_candidate.is_ready              = nodes[i].is_ready;
		}
	}

	/* once we know who is the best candidate, promote it */
	if (find_best && (best_candidate.nodeId == local_options.node))
	{
		if (verbose)
			log_info(_("%s: This node is the best candidate to be the new primary, promoting...\n"),
			         progname);
		log_debug(_("promote command is: \"%s\"\n"), local_options.promote_command);
		r = system(local_options.promote_command);
		if (r != 0)
		{
			log_err(_("%s: promote command failed. You could check and try it manually.\n"), progname);
			exit(ERR_BAD_CONFIG);
		}
	}
	else if (find_best)
	{
		if (verbose)
			log_info(_("%s: Node %d is the best candidate to be the new primary, we should follow it...\n"),
			         progname, best_candidate.nodeId);
		log_debug(_("follow command is: \"%s\"\n"), local_options.follow_command);
		/*
		 * The new primary needs some time to be promoted.
		 * The follow command should take care of that.
		 */
		r = system(local_options.follow_command);
		if (r != 0)
		{
			log_err(_("%s: follow command failed. You could check and try it manually.\n"), progname);
			exit(ERR_BAD_CONFIG);
		}
	}
	else
	{
		log_err(_("%s: Did not find candidates. You should check and try manually.\n"), progname);
		exit(ERR_FAILOVER_FAIL);
	}

	/* and reconnect to the local database */
	myLocalConn = establishDBConnection(local_options.conninfo, true);
}