/*
 * GetReplicationApplyDelay
 *
 * Returns the replication apply delay in milliseconds.
 *
 * The result is 0 when the replay position equals the last received
 * position.  Otherwise it is the time elapsed between
 * GetCurrentChunkReplayStartTime() and the current timestamp.
 */
int
GetReplicationApplyDelay(void)
{
	/* go through a volatile pointer so accesses are not rearranged */
	volatile WalRcvData *walrcv = WalRcv;
	XLogRecPtr	received;
	XLogRecPtr	replayed;
	long		elapsedSecs;
	int			elapsedUsecs;
	int			delayMs;

	/* Read the received-up-to position while holding the walreceiver mutex. */
	SpinLockAcquire(&walrcv->mutex);
	received = walrcv->receivedUpto;
	SpinLockRelease(&walrcv->mutex);

	replayed = GetXLogReplayRecPtr(NULL);

	if (!XLByteEQ(received, replayed))
	{
		TimestampDifference(GetCurrentChunkReplayStartTime(),
							GetCurrentTimestamp(),
							&elapsedSecs, &elapsedUsecs);

		/*
		 * NOTE(review): the seconds value is narrowed to int before being
		 * scaled, so a very large difference would not fit -- confirm that
		 * callers never see such values.
		 */
		delayMs = (int) elapsedSecs * 1000;
		delayMs += elapsedUsecs / 1000;
		return delayMs;
	}

	/* Replay has caught up with everything received. */
	return 0;
}
Exemple #2
0
/*
 * XLogWalRcvSendReply
 *
 * Build a StandbyReplyMessage carrying our current write, flush and apply
 * XLOG positions plus the current time, and send it to the primary prefixed
 * with the message type byte 'r'.
 *
 * Nothing is sent when wal_receiver_status_interval is <= 0, or when neither
 * the write nor the flush position has moved since the previous reply and
 * the status interval has not yet elapsed.
 */
static void
XLogWalRcvSendReply(void)
{
	char		packet[sizeof(StandbyReplyMessage) + 1];
	TimestampTz	currentTime;

	/* Status reporting switched off: leave before doing anything at all. */
	if (wal_receiver_status_interval <= 0)
		return;

	currentTime = GetCurrentTimestamp();

	/*
	 * The write and flush positions can be compared against the last message
	 * without any lock, whereas fetching the apply position needs a spin
	 * lock.  So the apply position is only looked up when one of the other
	 * two moved or the status interval expired.  The primary may therefore
	 * see a slightly stale apply position on an idle system, which is
	 * acceptable for a value used only for reporting.
	 */
	if (XLByteEQ(reply_message.write, LogstreamResult.Write) &&
		XLByteEQ(reply_message.flush, LogstreamResult.Flush))
	{
		if (!TimestampDifferenceExceeds(reply_message.sendTime, currentTime,
										wal_receiver_status_interval * 1000))
			return;
	}

	/* Fill in the fresh reply. */
	reply_message.write = LogstreamResult.Write;
	reply_message.flush = LogstreamResult.Flush;
	reply_message.apply = GetXLogReplayRecPtr();
	reply_message.sendTime = currentTime;

	elog(DEBUG2, "sending write %X/%X flush %X/%X apply %X/%X",
		 reply_message.write.xlogid, reply_message.write.xrecoff,
		 reply_message.flush.xlogid, reply_message.flush.xrecoff,
		 reply_message.apply.xlogid, reply_message.apply.xrecoff);

	/* Type byte first, then the raw message struct. */
	packet[0] = 'r';
	memcpy(packet + 1, &reply_message, sizeof(StandbyReplyMessage));
	walrcv_send(packet, sizeof(packet));
}
Exemple #3
0
/*
 * killtuple
 *
 * Mark as dead the index item on the scan's current buffer (so->curbuf)
 * that corresponds to 'iptr'.
 *
 * If the page LSN still equals the LSN saved in so->stack, the offset stored
 * in 'iptr' is used directly.  Otherwise the page is scanned for an index
 * tuple whose t_tid equals 'iptr'; if none is found nothing is marked.
 */
static void
killtuple(Relation r, GISTScanOpaque so, ItemPointer iptr)
{
	MIRROREDLOCK_BUFMGR_DECLARE;

	Page		page;
	OffsetNumber off;

	// -------- MirroredLock ----------
	MIRROREDLOCK_BUFMGR_LOCK;

	LockBuffer(so->curbuf, GIST_SHARE);
	gistcheckpage(r, so->curbuf);
	page = (Page) BufferGetPage(so->curbuf);

	if (!XLByteEQ(so->stack->lsn, PageGetLSN(page)))
	{
		/* page changed since it was visited: search for the tuple */
		OffsetNumber lastoff = PageGetMaxOffsetNumber(page);

		off = FirstOffsetNumber;
		while (off <= lastoff)
		{
			IndexTuple	candidate;

			candidate = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
			if (ItemPointerEquals(&(candidate->t_tid), iptr))
			{
				/* this is the one */
				ItemIdMarkDead(PageGetItemId(page, off));
				SetBufferCommitInfoNeedsSave(so->curbuf);
				break;
			}

			off = OffsetNumberNext(off);
		}
	}
	else
	{
		/* page is unchanged, so the offset in iptr can be used as is */
		off = ItemPointerGetOffsetNumber(iptr);
		ItemIdMarkDead(PageGetItemId(page, off));
		SetBufferCommitInfoNeedsSave(so->curbuf);
	}

	LockBuffer(so->curbuf, GIST_UNLOCK);

	MIRROREDLOCK_BUFMGR_UNLOCK;
	// -------- MirroredLock ----------
}
/*
 * FileRepPrimary_ResyncBufferPoolIncrementalWrite
 *
 * Incremental resynchronization of buffer pool relations, driven by change
 * tracking (CT).
 *
 * Repeatedly asks ChangeTracking_GetChanges() for the changes described by
 * 'request'.  For every returned entry the block is read into the buffer
 * pool, its page LSN and the CT end LSN are logged, and the page is written
 * through smgrwrite() when the page LSN is less than or equal to the CT end
 * LSN.  Entries whose block number lies beyond the current relation size
 * (e.g. after a truncate) are logged and skipped.
 *
 * When the last entry of a relation has been handled and the result does not
 * ask for more, the relation is synced and closed and its resync entry is
 * updated via FileRepResync_UpdateEntry().
 *
 * Every 'thresholdCount' entries signals are processed; if the process is no
 * longer in FileRepStateReady / DataStateInResync the scan of the current
 * result stops and STATUS_ERROR is reported.
 *
 * 'request' and the last result are freed before returning.
 *
 * Returns STATUS_OK on success, the failing status of
 * FileRepResync_UpdateEntry(), or STATUS_ERROR on mirror data loss.
 */
static int
FileRepPrimary_ResyncBufferPoolIncrementalWrite(ChangeTrackingRequest *request)
{
	int				status = STATUS_OK;
	Page			page;
	Buffer			buf;
	BlockNumber		numBlocks = 0;
	SMgrRelation	smgr_relation = NULL;
	char			relidstr[OIDCHARS + 1 + OIDCHARS + 1 + OIDCHARS + 1];
	int				ii;
	XLogRecPtr		loc;		/* LSN of the page in the buffer pool */
	XLogRecPtr		loc1;		/* end LSN recorded by change tracking */
	int				count = 0;
	int				thresholdCount = 0;
	bool			mirrorDataLossOccurred = FALSE;
	int				NumberOfRelations = request->count;

	FileRepResyncHashEntry_s	entry;
	ChangeTrackingResult		*result = NULL;

	while (1)
	{
		/* allow flushing buffers from buffer pool during scan */
		FileRepResync_SetReadBufferRequest();
		if ((result = ChangeTracking_GetChanges(request)) != NULL)
		{
			FileRepResync_ResetReadBufferRequest();

			for (ii = 0; ii < result->count; ii++)
			{

				/* first entry of a new relation: open it and get its size */
				if (smgr_relation == NULL)
				{
					NumberOfRelations--;

					smgr_relation = smgropen(result->entries[ii].relFileNode);

					snprintf(relidstr, sizeof(relidstr), "%u/%u/%u",
							 smgr_relation->smgr_rnode.spcNode,
							 smgr_relation->smgr_rnode.dbNode,
							 smgr_relation->smgr_rnode.relNode);

					numBlocks = smgrnblocks(smgr_relation);

					if (Debug_filerep_print)
						elog(LOG, "resynchronize buffer pool relation '%u/%u/%u' "
							 "number of blocks:'%u' ",
							 smgr_relation->smgr_rnode.spcNode,
							 smgr_relation->smgr_rnode.dbNode,
							 smgr_relation->smgr_rnode.relNode,
							 numBlocks);

					thresholdCount = Min(numBlocks, 1024);
				}

				loc1 =  result->entries[ii].lsn_end;

				/*
				 * if relation was truncated then block_num from change tracking can be beyond numBlocks
				 */
				if (result->entries[ii].block_num >=  numBlocks)
				{
					ereport(LOG,
							(errmsg("could not resynchonize buffer pool relation '%s' block '%d' (maybe due to truncate), "
									"lsn change tracking '%s(%u/%u)' "
									"number of blocks '%d' ",
									relidstr,
									result->entries[ii].block_num,
									XLogLocationToString(&loc1),
									loc1.xlogid,
									loc1.xrecoff,
									numBlocks),
							 FileRep_errcontext()));

					goto flush_check;
				}

				/* allow flushing buffers from buffer pool during scan */
				FileRepResync_SetReadBufferRequest();
				buf = ReadBuffer_Resync(smgr_relation,
										result->entries[ii].block_num,
										relidstr);
				FileRepResync_ResetReadBufferRequest();

				Assert(result->entries[ii].block_num < numBlocks);

				LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
				page = BufferGetPage(buf);

				loc = PageGetLSN(page);

				/*
				 * NOTE(review): XLogLocationToString() is called twice within
				 * single log statements in this function; if it formats into
				 * one shared buffer both %s fields show the same text.  The
				 * numeric (%u/%u) pairs are unaffected.  Confirm against the
				 * helper's definition.
				 */
				if(Debug_filerep_print)
				{
					elog(LOG,
							"incremental resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn page '%s(%u/%u)' "
							"lsn end change tracking '%s(%u/%u)' ",
							relidstr,
							numBlocks,
							result->entries[ii].block_num,
							XLogLocationToString(&loc),
							loc.xlogid,
							loc.xrecoff,
							XLogLocationToString(&loc1),
							result->entries[ii].lsn_end.xlogid,
							result->entries[ii].lsn_end.xrecoff);
				}
				else
				{
					char	tmpBuf[FILEREP_MAX_LOG_DESCRIPTION_LEN];

					snprintf(tmpBuf, sizeof(tmpBuf),
							 "incremental resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn page '%s(%u/%u)' ",
							 relidstr,
							 numBlocks,
							 result->entries[ii].block_num,
							 XLogLocationToString(&loc),
							 loc.xlogid,
							 loc.xrecoff);

					FileRep_InsertConfigLogEntry(tmpBuf);

					snprintf(tmpBuf, sizeof(tmpBuf),
							 "incremental resync buffer pool identifier '%s' lsn end change tracking '%s(%u/%u)' ",
							 relidstr,
							 XLogLocationToString(&loc1),
							 result->entries[ii].lsn_end.xlogid,
							 result->entries[ii].lsn_end.xrecoff);

					FileRep_InsertConfigLogEntry(tmpBuf);

				}

				/*
				 * Write the page when its LSN is not newer than the LSN
				 * recorded by change tracking (page LSN <= CT end LSN).  The
				 * operands were previously reversed, which contradicted both
				 * the log message and the explanation below.
				 */
				if (XLByteLE(PageGetLSN(page), result->entries[ii].lsn_end))
				{
					if (! XLByteEQ(PageGetLSN(page), result->entries[ii].lsn_end))
					{
						/*
						 * Arguments follow the labels in the message: the
						 * change tracking LSN first, then the page LSN.
						 */
						ereport(LOG,
							(errmsg("Resynchonize buffer pool relation '%s' block '%d' has page lsn less than CT lsn, "
								"lsn end change tracking '%s(%u/%u)' lsn page '%s(%u/%u)' "
								"number of blocks '%d'",
								relidstr,
								result->entries[ii].block_num,
								XLogLocationToString(&loc1),
								loc1.xlogid,
								loc1.xrecoff,
								XLogLocationToString(&loc),
								loc.xlogid,
								loc.xrecoff,
								numBlocks),
							 FileRep_errcontext()));

					}

					/*
					 * It's safe and better to perform write of the page to mirror,
					 * for this case, as primary and mirror data pages should always
					 * be same. So, we might do some extra work but definitely won't
					 * lose blocks, or error out and need to perform full recovery.
					 * Need to cover for this case as there are some known scenarios where
					 * CT file can have extra records which should have been discarded,
					 * but as we lose the xlog LSN information they cannot be discarded.
					 * One such case is when CT_TRANSIENT being compacted to CT_COMPACT
					 * with specific xlog LSN (to discard extra records) in CT mode gets
					 * interrupted by resync. Compaction during Resync collects all the
					 * CT records and doesn't have xlog LSN information to discard any
					 * extra records from CT_TRANSIENT.
					 */

					smgrwrite(smgr_relation,
							  result->entries[ii].block_num,
							  (char *)BufferGetBlock(buf),
							  FALSE);
				}

#ifdef FAULT_INJECTOR
				FaultInjector_InjectFaultIfSet(
											   FileRepResyncWorker,
											   DDLNotSpecified,
											   "",	// databaseName
											   ""); // tableName
#endif

				UnlockReleaseBuffer(buf);

#ifdef FAULT_INJECTOR
				FaultInjector_InjectFaultIfSet(
											   FileRepResyncWorker,
											   DDLNotSpecified,
											   "",	// databaseName
											   ""); // tableName
#endif

	flush_check:
				/*
				 * Last entry of the result, or the next entry belongs to a
				 * different relation: finish up the current relation.
				 */
				if (((ii + 1) == result->count) ||
					! (result->entries[ii].relFileNode.spcNode == result->entries[ii+1].relFileNode.spcNode &&
					   result->entries[ii].relFileNode.dbNode == result->entries[ii+1].relFileNode.dbNode &&
					   result->entries[ii].relFileNode.relNode == result->entries[ii+1].relFileNode.relNode))
				{
					if (result->ask_for_more == false)
					{

						smgrimmedsync(smgr_relation);

						smgrclose(smgr_relation);

						smgr_relation = NULL;

						FileRep_GetRelationPath(
												 entry.fileName,
												 result->entries[ii].relFileNode,
												 0 /* segment file number is always 0 for Buffer Pool */);

						status = FileRepResync_UpdateEntry(&entry);
						if (status != STATUS_OK)
						{
							 break;
						}
					}

				}

				/* periodically process signals and re-check the state */
				if (count > thresholdCount)
				{
					count = 0;
					FileRepSubProcess_ProcessSignals();

					if (! (FileRepSubProcess_GetState() == FileRepStateReady &&
						   dataState == DataStateInResync))
					{
						mirrorDataLossOccurred = TRUE;
						break;
					}
				}
				else
					count++;
			}  // for (ii = 0; ii < result->count; ii++)

		} // if ((result = ChangeTracking_GetChanges(request)) != NULL)

		FileRepResync_ResetReadBufferRequest();

		/* more changes pending for this relation: continue from next LSN */
		if (result != NULL && result->ask_for_more == true)
		{
			Assert(request->count == 1);
			request->entries[0].lsn_start = result->next_start_lsn;
		}
		else
		{
			break;
		}

	} // while(1)

	ChangeTracking_FreeRequest(request);
	ChangeTracking_FreeResult(result);

	Insist(NumberOfRelations == 0);

	if (mirrorDataLossOccurred)
		status = STATUS_ERROR;

	return status;
}
Exemple #5
0
/*
 * dumpXLogRecord
 *
 * Print one XLOG record by dispatching to the print routine of its resource
 * manager, then print information about its backup blocks.
 *
 * Records are skipped when a specific rmid or xid filter is set (file-level
 * 'rmid' / 'xid') and the record does not match.  When 'header_only' is
 * true only a note that the record may continue in the next segment is
 * printed.
 *
 * Per-resource-manager statistics in 'xlogstats' are updated for each record
 * that is dumped.
 */
static void
dumpXLogRecord(XLogRecord *record, bool header_only)
{
	uint8	info = record->xl_info & ~XLR_INFO_MASK;

	/* check if the user wants a specific rmid */
	if (rmid>=0 && record->xl_rmid!=rmid)
		return;

	/* check if the user wants a specific transaction id */
	if (xid!=InvalidTransactionId && xid!=record->xl_xid)
		return;

#ifdef NOT_USED
	printf("%u/%08X: prv %u/%08X",
		   curRecPtr.xlogid, curRecPtr.xrecoff,
		   record->xl_prev.xlogid, record->xl_prev.xrecoff);

	if (!XLByteEQ(record->xl_prev, prevRecPtr))
		printf("(?)");

	printf("; xid %u; ", record->xl_xid);

	if (record->xl_rmid <= RM_MAX_ID)
		printf("%s", RM_names[record->xl_rmid]);
	else
		printf("RM %2d", record->xl_rmid);

	printf(" info %02X len %u tot_len %u\n", record->xl_info,
		   record->xl_len, record->xl_tot_len);
#endif

	if (header_only)
	{
		printf(" ** maybe continues to next segment **\n");
		return;
	}

	/*
	 * See rmgr.h for more details about the built-in resource managers.
	 *
	 * Only account for known resource manager IDs.  The switch below expects
	 * that xl_rmid can hold an unknown value, so it must not be used as an
	 * array index unchecked.
	 * NOTE(review): assumes the xlogstats arrays hold RM_MAX_ID + 1 entries
	 * -- confirm against the xlogstats declaration.
	 */
	if (record->xl_rmid <= RM_MAX_ID)
	{
		xlogstats.rmgr_count[record->xl_rmid]++;
		xlogstats.rmgr_len[record->xl_rmid] += record->xl_len;
	}

	switch (record->xl_rmid)
	{
		case RM_XLOG_ID:
			print_rmgr_xlog(curRecPtr, record, info, hideTimestamps);
			break;
		case RM_XACT_ID:
			print_rmgr_xact(curRecPtr, record, info, hideTimestamps);
			break;
		case RM_SMGR_ID:
			print_rmgr_smgr(curRecPtr, record, info);
			break;
		case RM_CLOG_ID:
			print_rmgr_clog(curRecPtr, record, info);
			break;
		case RM_DBASE_ID:
			print_rmgr_dbase(curRecPtr, record, info);
			break;
		case RM_TBLSPC_ID:
			print_rmgr_tblspc(curRecPtr, record, info);
			break;
		case RM_MULTIXACT_ID:
			print_rmgr_multixact(curRecPtr, record, info);
			break;
#if PG_VERSION_NUM >= 90000
		case RM_RELMAP_ID:
			print_rmgr_relmap(curRecPtr, record, info);
			break;
		case RM_STANDBY_ID:
			print_rmgr_standby(curRecPtr, record, info);
			break;
#endif
		case RM_HEAP2_ID:
			print_rmgr_heap2(curRecPtr, record, info);
			break;
		case RM_HEAP_ID:
			print_rmgr_heap(curRecPtr, record, info, statements);
			break;
		case RM_BTREE_ID:
			print_rmgr_btree(curRecPtr, record, info);
			break;
		case RM_HASH_ID:
			print_rmgr_hash(curRecPtr, record, info);
			break;
		case RM_GIN_ID:
			print_rmgr_gin(curRecPtr, record, info);
			break;
		case RM_GIST_ID:
			print_rmgr_gist(curRecPtr, record, info);
			break;
		case RM_SEQ_ID:
			print_rmgr_seq(curRecPtr, record, info);
			break;
		default:
			fprintf(stderr, "Unknown RMID %d.\n", record->xl_rmid);
			break;
	}

	/*
	 * print info about backup blocks.
	 */
	print_backup_blocks(curRecPtr, record);
}
/*
 * WalSendServerDoRequest
 *
 * Execute one request of the master-mirroring WAL send server, selected by
 * walSendRequest->command:
 *
 *	PositionToEnd	 - ask the standby for its transaction log end location
 *					   (stored in originalEndLocation).
 *	Catchup			 - if mirroring is catching up, send the WAL between the
 *					   standby's end location and the primary's flushed
 *					   location, then enable mirroring; on failure disable
 *					   mirroring and close the connection.
 *	WriteWalPages	 - copy the requested WAL pages into saveBuffer; nothing
 *					   is sent until a flush is requested.
 *	FlushWalPages	 - send the accumulated saveBuffer contents to the standby
 *					   and, if present, the new checkpoint location.
 *	CloseForShutdown - perform the normal shutdown work.
 *
 * Any other command raises an ERROR.
 */
static void
WalSendServerDoRequest(WalSendRequest *walSendRequest)
{
	bool successful;
	struct timeval standbyTimeout;

	WalSendServerGetStandbyTimeout(&standbyTimeout);

	switch (walSendRequest->command)
	{
	case PositionToEnd:
		elog((Debug_print_qd_mirroring ? LOG : DEBUG5), "PositionToEnd");

		successful = write_position_to_end(&originalEndLocation,
										   NULL, &walsend_shutdown_requested);
		if (successful)
			elog(LOG,"Standby master returned transaction log end location %s",
				 XLogLocationToString(&originalEndLocation));
		else
		{
			disableQDMirroring_ConnectionError(
				"Unable to connect to standby master and determine transaction log end location",
				GetStandbyErrorString());
			disconnectMirrorQD_SendClose();
		}
		break;

	case Catchup:
		elog((Debug_print_qd_mirroring ? LOG : DEBUG5), "Catchup");

		if (isQDMirroringCatchingUp())
		{
			bool tooFarBehind = false;

			elog(LOG,"Current master transaction log is flushed through location %s",
				 XLogLocationToString(&walSendRequest->flushedLocation));

			if (XLByteLT(originalEndLocation, walSendRequest->flushedLocation))
			{
				/*
				 * Standby master is behind the primary.  Send catchup WAL.
				 */

				/*
				 * Use a TRY block to catch errors from our attempt to read
				 * the primary's WAL.  Errors from sending to the standby
				 * come up as a boolean return (successful).
				 */
				PG_TRY();
				{
					successful = XLogCatchupQDMirror(
									&originalEndLocation,
									&walSendRequest->flushedLocation,
									&standbyTimeout,
									&walsend_shutdown_requested);
				}
				PG_CATCH();
				{
					/*
					 * Report the error related to reading the primary's WAL
					 * to the server log
					 */

					/*
					 * But first demote the error to something much less
					 * scary.
					 */
					if (!elog_demote(WARNING))
					{
						elog(LOG,"unable to demote error");
						PG_RE_THROW();
					}

					EmitErrorReport();
					FlushErrorState();

					successful = false;
					tooFarBehind = true;
				}
				PG_END_TRY();

				if (successful)
				{
					elog((Debug_print_qd_mirroring ? LOG : DEBUG5),
						 "catchup send from standby end %s through primary flushed location %s",
						 XLogLocationToString(&originalEndLocation),
						 XLogLocationToString2(&walSendRequest->flushedLocation));
				}

			}
			else if (XLByteEQ(originalEndLocation, walSendRequest->flushedLocation))
			{
				elog((Debug_print_qd_mirroring ? LOG : DEBUG5),"Mirror was already caught up");
				successful = true;
			}
			else
			{
				elog(WARNING,"Standby master transaction log location %s is beyond the current master end location %s",
					 XLogLocationToString(&originalEndLocation),
					 XLogLocationToString2(&walSendRequest->flushedLocation));
				successful = false;
			}

			if (successful)
			{
				char detail[200];
				int count;

				count = snprintf(
							 detail, sizeof(detail),
							 "Transaction log copied from locations %s through %s to the standby master",
							 XLogLocationToString(&originalEndLocation),
							 XLogLocationToString2(&walSendRequest->flushedLocation));
				if (count >= sizeof(detail))
				{
					ereport(ERROR,
							(errcode(ERRCODE_INTERNAL_ERROR),
							 errmsg("format command string failure")));
				}

				enableQDMirroring("Master mirroring is now synchronized", detail);

				currentEndLocation = walSendRequest->flushedLocation;

				/* restart the periodic progress report from here */
				periodicLen = 0;
				periodicLocation = currentEndLocation;
			}
			else
			{
				if (tooFarBehind)
				{
					disableQDMirroring_TooFarBehind(
						"The current master was unable to synchronize the standby master "
						"because the transaction logs on the current master were recycled.  "
						"A gpinitstandby (at an appropriate time) will be necessary to copy "
						"over the whole master database to the standby master so it may be synchronized");
				}
				else
				{
					disableQDMirroring_ConnectionError(
						"Connection to the standby master was lost during transaction log catchup",
						GetStandbyErrorString());
				}
				disconnectMirrorQD_SendClose();
			}
		}
		else if (isQDMirroringDisabled())
		{
			elog((Debug_print_qd_mirroring ? LOG : DEBUG5), "Master Mirror Send: Master mirroring not catching-up (state is disabled)");
		}
		else
		{
			elog(ERROR,"unexpected master mirroring state %s",
				 QDMirroringStateString());
		}

		break;

	case WriteWalPages:
		if (Debug_print_qd_mirroring)
			elog(LOG, "WriteWalPages");

		if (isQDMirroringEnabled())
		{
			char	   *from;
			Size		nbytes;
			bool		more= false;

			/*
			 * For now, save copy of data until flush.  This could be
			 * optimized.
			 */
			if (saveBuffer == NULL)
			{
				uint32 totalBufferLen = XLOGbuffers * XLOG_BLCKSZ;

				saveBuffer = malloc(totalBufferLen);
				if (saveBuffer == NULL)
					elog(ERROR,"Could not allocate buffer for xlog data (%d bytes)",
						 totalBufferLen);

				saveBufferLen = 0;
			}

			XLogGetBuffer(walSendRequest->startidx, walSendRequest->npages,
						  &from, &nbytes);

			if (saveBufferLen == 0)
			{
				/* first write since the last flush: remember where it starts */
				more = false;
				writeLogId = walSendRequest->logId;
				writeLogSeg = walSendRequest->logSeg;
				writeLogOff = walSendRequest->logOff;

				memcpy(saveBuffer, from, nbytes);
				saveBufferLen = nbytes;
			}
			else
			{
				/* append to what is already buffered */
				more = true;
				memcpy(&saveBuffer[saveBufferLen], from, nbytes);
				saveBufferLen += nbytes;
			}

			if (Debug_print_qd_mirroring)
				elog(LOG,
					 "Master Mirror Send: WriteWalPages (%s) startidx %d, npages %d, timeLineID %d, logId %u, logSeg %u, logOff 0x%X, nbytes 0x%X",
					 (more ? "more" : "new"),
					 walSendRequest->startidx,
					 walSendRequest->npages,
					 walSendRequest->timeLineID,
					 walSendRequest->logId,
					 walSendRequest->logSeg,
					 walSendRequest->logOff,
					 (int)nbytes);
		}
		else if (isQDMirroringDisabled())
		{
			elog((Debug_print_qd_mirroring ? LOG : DEBUG5), "Master Mirror Send: Master mirroring not enabled");
		}
		else
		{
			elog(ERROR,"unexpected master mirroring state %s",
				 QDMirroringStateString());
		}

		/*
		 * Do not fall through into FlushWalPages: the pages are only
		 * buffered here and are sent when a flush is requested.  The
		 * disabled / unexpected-state handling above matches what the
		 * other cases do (and what used to be reached via the fall
		 * through).
		 */
		break;

	case FlushWalPages:
		if (Debug_print_qd_mirroring)
			elog(LOG, "FlushWalPages");

		if (isQDMirroringEnabled())
		{
			char 		cmd[MAXFNAMELEN + 50];

			if (saveBufferLen == 0)
				successful = true;
			else
			{
				if (snprintf(cmd, sizeof(cmd),"xlog %d %d %d %d",
							 writeLogId, writeLogSeg, writeLogOff,
							 (int)saveBufferLen) >= sizeof(cmd))
					elog(ERROR,"could not create cmd for qd mirror logid %d seg %d",
						 writeLogId, writeLogSeg);

				successful = write_qd_sync(cmd, saveBuffer, saveBufferLen,
										   &standbyTimeout,
										   &walsend_shutdown_requested);
				if (successful)
				{
					XLogRecPtr oldEndLocation;

					oldEndLocation = currentEndLocation;

					currentEndLocation.xlogid = writeLogId;
					currentEndLocation.xrecoff = writeLogSeg * XLogSegSize + writeLogOff;
					if (currentEndLocation.xrecoff >= XLogFileSize)
					{
						(currentEndLocation.xlogid)++;
						currentEndLocation.xrecoff = 0;
					}

					if (XLByteLT(oldEndLocation,currentEndLocation))
					{
						periodicLen += saveBufferLen;
						if (periodicLen > periodicReportLen)
						{
							elog(LOG,
								 "Master mirroring periodic report: %d bytes successfully send to standby master for locations %s through %s",
								 periodicLen,
								 XLogLocationToString(&periodicLocation),
								 XLogLocationToString2(&currentEndLocation));

							periodicLen = 0;
							periodicLocation = currentEndLocation;
						}
					}
					else
					{
						if (Debug_print_qd_mirroring)
							elog(LOG,
								 "Send to Master mirror successful.  New end location %s (old %s)",
								 XLogLocationToString(&currentEndLocation),
								 XLogLocationToString2(&oldEndLocation));
					}
				}
				else
				{
					disableQDMirroring_ConnectionError(
						"Connection to the standby master was lost attempting to send new transaction log",
						GetStandbyErrorString());
					disconnectMirrorQD_SendClose();
				}

				/*
				 * Reset so WriteWalPages can fill the buffer again.
				 */
				saveBufferLen = 0;
				writeLogId = 0;
				writeLogSeg = 0;
				writeLogOff = 0;
			}

			if (successful && walSendRequest->haveNewCheckpointLocation)
			{
				uint32 logid;
				uint32 seg;
				uint32 offset;

				elog((Debug_print_qd_mirroring ? LOG : DEBUG5),"New previous checkpoint location %s",
					 XLogLocationToString(&walSendRequest->newCheckpointLocation));
				XLByteToSeg(walSendRequest->newCheckpointLocation, logid, seg);
				offset = walSendRequest->newCheckpointLocation.xrecoff % XLogSegSize;

				if (snprintf(cmd, sizeof(cmd),"new_checkpoint_location %d %d %d",
							 logid, seg, offset) >= sizeof(cmd))
					elog(ERROR,"could not create cmd for qd mirror logid %d seg %d offset %d",
						 logid, seg, offset);

				successful = write_qd_sync(cmd, NULL, 0,
										   NULL, &walsend_shutdown_requested);
				if (successful)
				{
					elog((Debug_print_qd_mirroring ? LOG : DEBUG5),"Send of new checkpoint location to master mirror successful");
				}
				else
				{
					disableQDMirroring_ConnectionError(
						"Connection to the standby master was lost attempting to send new checkpoint location",
						GetStandbyErrorString());
					disconnectMirrorQD_SendClose();
				}
			}

		}
		else if (isQDMirroringDisabled())
		{
			elog((Debug_print_qd_mirroring ? LOG : DEBUG5), "Master Mirror Send: Master mirroring not enabled");
		}
		else
		{
			elog(ERROR,"unexpected master mirroring state %s",
				 QDMirroringStateString());
		}

		break;

	case CloseForShutdown:
		if (Debug_print_qd_mirroring)
			elog(LOG, "CloseForShutdown");

		/*
		 * Do the work we would normally do when signaled to stop.
		 */
		WalSendServer_ServiceShutdown();
		break;

	default:
		elog(ERROR, "Unknown WalSendRequestCommand %d", walSendRequest->command);
		break;
	}

}
Exemple #7
0
/*
 * gistFindCorrectParent
 *
 * Make child->parent point at the page that really holds the downlink to
 * child->blkno, and record the downlink's position in parent->childoffnum.
 *
 * The parent buffer is locked exclusively.  If its recorded childoffnum is
 * valid and its page LSN is unchanged, nothing more is done.  Otherwise the
 * parent and its right siblings are searched for the downlink.  If the chain
 * of right links ends without a match, the buffers of the remaining old
 * ancestors are released, a new path is obtained from gistFindPath(), its
 * buffers are read, and the function calls itself on the new parent.
 */
static void
gistFindCorrectParent(Relation r, GISTInsertStack *child)
{
	GISTInsertStack *parent = child->parent;
	GISTInsertStack *walk;
	OffsetNumber off;
	OffsetNumber lastoff;
	ItemId		iid;
	IndexTuple	downlink;

	MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD;

	LockBuffer(parent->buffer, GIST_EXCLUSIVE);
	gistcheckpage(r, parent->buffer);
	parent->page = (Page) BufferGetPage(parent->buffer);

	/*
	 * Parent is untouched and we already know the downlink's offset: done.
	 * No need to tell a split from a plain page update here.
	 */
	if (parent->childoffnum != InvalidOffsetNumber &&
		XLByteEQ(parent->lsn, PageGetLSN(parent->page)))
		return;

	/* parent has changed: follow right links until the child is found */
	for (;;)
	{
		lastoff = PageGetMaxOffsetNumber(parent->page);
		off = FirstOffsetNumber;
		while (off <= lastoff)
		{
			iid = PageGetItemId(parent->page, off);
			downlink = (IndexTuple) PageGetItem(parent->page, iid);
			if (ItemPointerGetBlockNumber(&(downlink->t_tid)) == child->blkno)
			{
				/* found the downlink */
				parent->childoffnum = off;
				return;
			}
			off = OffsetNumberNext(off);
		}

		/* not on this page; step to the right sibling, if any */
		parent->blkno = GistPageGetOpaque(parent->page)->rightlink;
		UnlockReleaseBuffer(parent->buffer);

		/*
		 * End of the chain without finding the parent.  This is a very rare
		 * situation that arises when the root has been split.
		 */
		if (parent->blkno == InvalidBlockNumber)
			break;

		parent->buffer = ReadBuffer(r, parent->blkno);
		LockBuffer(parent->buffer, GIST_EXCLUSIVE);
		gistcheckpage(r, parent->buffer);
		parent->page = (Page) BufferGetPage(parent->buffer);
	}

	/*
	 * The tree has to be searched for the parent.  First release every old
	 * ancestor; child->parent itself was already released in the loop above.
	 */
	for (walk = child->parent->parent; walk != NULL; walk = walk->parent)
		ReleaseBuffer(walk->buffer);

	/* find a new path from the root down to the child */
	parent = gistFindPath(r, child->blkno);
	Assert(parent != NULL);

	/*
	 * Read all buffers as the caller expects.  Note that they are neither
	 * locked nor passed to gistcheckpage here.
	 */
	for (walk = parent; walk != NULL; walk = walk->parent)
	{
		walk->buffer = ReadBuffer(r, walk->blkno);
		walk->page = (Page) BufferGetPage(walk->buffer);
	}

	/* hook the new chain of parents into the stack */
	child->parent = parent;
	parent->child = child;

	/* redo the normal processing on the new parent */
	gistFindCorrectParent(r, child);
}
Exemple #8
0
/*
 * Workhorse routine for doing insertion into a GiST index. Note that
 * this routine assumes it is invoked in a short-lived memory context,
 * so it does not bother releasing palloc'd allocations.
 *
 *	r			the index relation to insert into
 *	itup		the index tuple to insert
 *	freespace	copied into the insertion state (state.freespace) that is
 *				handed to the page-level insertion code
 *	giststate	passed through to gistchoose(), gistgetadjusted(),
 *				gistfixsplit() and gistinserttuples()
 *
 * The descent keeps a stack of GISTInsertStack entries, one per visited
 * level, linked through ->parent.  The root entry (firststack) lives on our
 * C stack and has parent == NULL; deeper entries are palloc'd.  The
 * 'xlocked' flag tracks whether the page at the top of the stack is
 * currently held with an exclusive lock (true) or not.
 */
static void
gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
{
	ItemId		iid;
	IndexTuple	idxtuple;
	GISTInsertStack firststack;
	GISTInsertStack *stack;
	GISTInsertState state;
	bool		xlocked = false;

	memset(&state, 0, sizeof(GISTInsertState));
	state.freespace = freespace;
	state.r = r;

	/* Start from the root */
	firststack.blkno = GIST_ROOT_BLKNO;
	firststack.lsn.xrecoff = 0;
	firststack.parent = NULL;
	state.stack = stack = &firststack;

	/*
	 * Walk down along the path of smallest penalty, updating the parent
	 * pointers with the key we're inserting as we go. If we crash in the
	 * middle, the tree is consistent, although the possible parent updates
	 * were a waste.
	 */
	for (;;)
	{
		/*
		 * An invalid LSN marks a stack entry whose page has not been read
		 * yet; entries we return to already carry a pinned buffer.
		 */
		if (XLogRecPtrIsInvalid(stack->lsn))
			stack->buffer = ReadBuffer(state.r, stack->blkno);

		/*
		 * Be optimistic and grab shared lock first. Swap it for an
		 * exclusive lock later if we need to update the page.
		 */
		if (!xlocked)
		{
			LockBuffer(stack->buffer, GIST_SHARE);
			gistcheckpage(state.r, stack->buffer);
		}

		stack->page = (Page) BufferGetPage(stack->buffer);
		stack->lsn = PageGetLSN(stack->page);
		Assert(!RelationNeedsWAL(state.r) || !XLogRecPtrIsInvalid(stack->lsn));

		/*
		 * If this page was split but the downlink was never inserted to
		 * the parent because the inserting backend crashed before doing
		 * that, fix that now.
		 */
		if (GistFollowRight(stack->page))
		{
			if (!xlocked)
			{
				LockBuffer(stack->buffer, GIST_UNLOCK);
				LockBuffer(stack->buffer, GIST_EXCLUSIVE);
				xlocked = true;
				/* someone might've completed the split when we unlocked */
				if (!GistFollowRight(stack->page))
					continue;
			}
			gistfixsplit(&state, giststate);

			/* Go back to the parent and choose a child again */
			UnlockReleaseBuffer(stack->buffer);
			xlocked = false;
			state.stack = stack = stack->parent;
			continue;
		}

		if (stack->blkno != GIST_ROOT_BLKNO &&
			XLByteLT(stack->parent->lsn,
					 GistPageGetOpaque(stack->page)->nsn))
		{
			/*
			 * Concurrent split detected. There's no guarantee that the
			 * downlink for this page is consistent with the tuple we're
			 * inserting anymore, so go back to parent and rechoose the
			 * best child.
			 */
			UnlockReleaseBuffer(stack->buffer);
			xlocked = false;
			state.stack = stack = stack->parent;
			continue;
		}

		if (!GistPageIsLeaf(stack->page))
		{
			/*
			 * This is an internal page so continue to walk down the tree.
			 * Find the child node that has the minimum insertion penalty.
			 */
			BlockNumber childblkno;
			IndexTuple newtup;
			GISTInsertStack *item;

			stack->childoffnum = gistchoose(state.r, stack->page, itup, giststate);
			iid = PageGetItemId(stack->page, stack->childoffnum);
			idxtuple = (IndexTuple) PageGetItem(stack->page, iid);
			childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));

			/*
			 * Check that it's not a leftover invalid tuple from pre-9.1
			 */
			if (GistTupleIsInvalid(idxtuple))
				ereport(ERROR,
						(errmsg("index \"%s\" contains an inner tuple marked as invalid",
								RelationGetRelationName(r)),
						 errdetail("This is caused by an incomplete page split at crash recovery before upgrading to 9.1."),
						 errhint("Please REINDEX it.")));

			/*
			 * Check that the key representing the target child node is
			 * consistent with the key we're inserting. Update it if it's not.
			 */
			newtup = gistgetadjusted(state.r, idxtuple, itup, giststate);
			if (newtup)
			{
				/*
				 * Swap shared lock for an exclusive one. Beware, the page
				 * may change while we unlock/lock the page...
				 */
				if (!xlocked)
				{
					LockBuffer(stack->buffer, GIST_UNLOCK);
					LockBuffer(stack->buffer, GIST_EXCLUSIVE);
					xlocked = true;
					stack->page = (Page) BufferGetPage(stack->buffer);

					if (!XLByteEQ(PageGetLSN(stack->page), stack->lsn))
					{
						/* the page was changed while we unlocked it, retry */
						continue;
					}
				}

				/*
				 * Update the tuple.
				 *
				 * gistinserttuples() might have to split the page to make
				 * the updated tuple fit. When it returns 'true' we cannot
				 * simply descend to the child chosen above; we have to
				 * rechoose.
				 */
				if (gistinserttuples(&state, stack, giststate, &newtup, 1,
									 stack->childoffnum, InvalidBuffer))
				{
					/*
					 * Normally we step back to the parent and rechoose
					 * from there.  The root, however, has no parent entry
					 * on the stack (firststack.parent is NULL), so
					 * stepping up from it would leave us with a NULL
					 * stack pointer that the next iteration dereferences.
					 * For the root we instead keep the buffer, still
					 * exclusively locked, and simply retry from it.
					 */
					if (stack->blkno != GIST_ROOT_BLKNO)
					{
						UnlockReleaseBuffer(stack->buffer);
						xlocked = false;
						state.stack = stack = stack->parent;
					}
					continue;
				}
			}

			/* Drop the lock but keep the pin while we visit the child */
			LockBuffer(stack->buffer, GIST_UNLOCK);
			xlocked = false;

			/* descend to the chosen child */
			item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
			item->blkno = childblkno;
			item->parent = stack;
			state.stack = stack = item;
		}
		else
		{
			/*
			 * Leaf page. Insert the new key. We've already updated all the
			 * parents on the way down, but we might have to split the page
			 * if it doesn't fit. gistinserttuples() will take care of that.
			 */

			/*
			 * Swap shared lock for an exclusive one. Be careful, the page
			 * may change while we unlock/lock the page...
			 */
			if (!xlocked)
			{
				LockBuffer(stack->buffer, GIST_UNLOCK);
				LockBuffer(stack->buffer, GIST_EXCLUSIVE);
				xlocked = true;
				stack->page = (Page) BufferGetPage(stack->buffer);
				stack->lsn = PageGetLSN(stack->page);

				if (stack->blkno == GIST_ROOT_BLKNO)
				{
					/*
					 * the only page that can become inner instead of leaf
					 * is the root page, so for root we should recheck it
					 */
					if (!GistPageIsLeaf(stack->page))
					{
						/*
						 * very rare situation: during unlock/lock index with
						 * number of pages = 1 was increased
						 */
						LockBuffer(stack->buffer, GIST_UNLOCK);
						xlocked = false;
						continue;
					}

					/*
					 * we don't need to check root split, because checking
					 * leaf/inner is enough to recognize split for root
					 */
				}
				else if (GistFollowRight(stack->page) ||
						 XLByteLT(stack->parent->lsn,
								  GistPageGetOpaque(stack->page)->nsn))
				{
					/*
					 * The page was split while we momentarily unlocked the
					 * page. Go back to parent.
					 */
					UnlockReleaseBuffer(stack->buffer);
					xlocked = false;
					state.stack = stack = stack->parent;
					continue;
				}
			}

			/* now state.stack->(page, buffer and blkno) points to leaf page */

			gistinserttuples(&state, stack, giststate, &itup, 1,
							 InvalidOffsetNumber, InvalidBuffer);
			LockBuffer(stack->buffer, GIST_UNLOCK);

			/* Release any pins we might still hold before exiting */
			for (; stack; stack = stack->parent)
				ReleaseBuffer(stack->buffer);
			break;
		}
	}
}
/*
 * cdb_perform_redo
 *
 * Apply one pass of redo on the standby and, if that moved the restart
 * (redo) location forward, remove log segments that precede it.
 *
 *	redoCheckPointLoc	in/out: location of the checkpoint to start from.
 *						If it is 0/0 on entry it is filled in via
 *						XLogGetRecoveryStart() first; it is refreshed
 *						again after the redo pass.
 *	redoCheckPoint		in/out: the checkpoint record belonging to
 *						redoCheckPointLoc, refreshed alongside it.
 *	newCheckpointLoc	passed through to XLogStandbyRecoverRange().
 *
 * Returns early, after logging, when the restart location did not advance
 * during this pass; no log files are removed in that case.
 */
void
cdb_perform_redo(XLogRecPtr *redoCheckPointLoc, CheckPoint *redoCheckPoint, XLogRecPtr *newCheckpointLoc)
{
	CheckPoint	oldRedoCheckpoint;
	uint32		logid;
	uint32		seg;
	int			nsegsremoved = 0;	/* defensive: don't rely on callee to set it */

	if (redoCheckPointLoc->xlogid == 0 && redoCheckPointLoc->xrecoff == 0)
	{
		XLogGetRecoveryStart("QDSYNC", "for redo apply", redoCheckPointLoc, redoCheckPoint);
	}

	XLogStandbyRecoverRange(redoCheckPointLoc, redoCheckPoint, newCheckpointLoc);

	/*
	 * Sample the recovery start location now to see if applying redo
	 * processed checkpoint records and moved the restart location forward.
	 */
	oldRedoCheckpoint = *redoCheckPoint;

	XLogGetRecoveryStart("QDSYNC", "for redo progress check", redoCheckPointLoc, redoCheckPoint);

	if (XLByteLT(oldRedoCheckpoint.redo,redoCheckPoint->redo))
	{
		ereport(LOG,
		 (errmsg("QDSYNC: transaction redo moved the restart location from %s to %s",
			     XLogLocationToString(&oldRedoCheckpoint.redo),
			     XLogLocationToString2(&redoCheckPoint->redo))));
	}
	else
	{
		/* The restart location is not expected to move backwards. */
		Assert(XLByteEQ(oldRedoCheckpoint.redo,redoCheckPoint->redo));
		ereport(LOG,
		 (errmsg("QDSYNC: transaction redo did not move the restart location %s forward this pass",
			     XLogLocationToString(&oldRedoCheckpoint.redo))));
		return;
	}

	XLByteToSeg(redoCheckPoint->redo, logid, seg);

	/*
	 * Delete offline log files (those no longer needed even for previous
	 * checkpoint).
	 *
	 * logid and seg are uint32, so they are printed with %u.
	 */
	elog((Debug_print_qd_mirroring ? LOG : DEBUG5),
	     "QDSYNC: keep log files as far back as (logid %u, seg %u)",
		 logid, seg);

	/* Nothing precedes segment 0 of log 0, so there is nothing to remove. */
	if (logid || seg)
	{
		PrevLogSeg(logid, seg);
		elog((Debug_print_qd_mirroring ? LOG : DEBUG5),
			 "QDSYNC: delete offline log files up to (logid %u, seg %u)",
			 logid, seg);

		XLogRemoveStandbyLogs(logid, seg, &nsegsremoved);

		if (nsegsremoved > 0)
		{
			/* Throw in extra new line to make log more readable. */
			ereport(LOG,
			 (errmsg("QDSYNC: %d logs removed through logid %u, seg %u\n",
				     nsegsremoved,
				     logid, seg)));
		}

	}

	/* Emit a separator line to make the log more readable. */
	elog(LOG,"--------------------------");
}
Exemple #10
0
/*
 * Update the LSNs on each queue based upon our latest state. This
 * implements a simple policy of first-valid-standby-releases-waiter.
 *
 * Other policies are possible, which would change what we do here and what
 * perhaps also which information we store as well.
 */
void
SyncRepReleaseWaiters(void)
{
	volatile WalSndCtlData *walsndctl = WalSndCtl;
	volatile WalSnd *syncWalSnd = NULL;
	int			numwrite = 0;
	int			numflush = 0;
	int			priority = 0;
	int			i;

	/*
	 * If this WALSender is serving a standby that is not on the list of
	 * potential standbys then we have nothing to do. If we are still starting
	 * up, still running base backup or the current flush position is still
	 * invalid, then leave quickly also.
	 */
	if (MyWalSnd->sync_standby_priority == 0 ||
		MyWalSnd->state < WALSNDSTATE_STREAMING ||
		XLByteEQ(MyWalSnd->flush, InvalidXLogRecPtr))
		return;

	/*
	 * We're a potential sync standby. Release waiters if we are the highest
	 * priority standby. If there are multiple standbys with same priorities
	 * then we use the first mentioned standby. If you change this, also
	 * change pg_stat_get_wal_senders().
	 */
	LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);

	for (i = 0; i < max_wal_senders; i++)
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile WalSnd *walsnd = &walsndctl->walsnds[i];

		if (walsnd->pid != 0 &&
			walsnd->state == WALSNDSTATE_STREAMING &&
			walsnd->sync_standby_priority > 0 &&
			(priority == 0 ||
			 priority > walsnd->sync_standby_priority) &&
			!XLByteEQ(walsnd->flush, InvalidXLogRecPtr))
		{
			priority = walsnd->sync_standby_priority;
			syncWalSnd = walsnd;
		}
	}

	/*
	 * We should have found ourselves at least.
	 */
	Assert(syncWalSnd);

	/*
	 * If we aren't managing the highest priority standby then just leave.
	 */
	if (syncWalSnd != MyWalSnd)
	{
		LWLockRelease(SyncRepLock);
		announce_next_takeover = true;
		return;
	}

	/*
	 * Set the lsn first so that when we wake backends they will release up to
	 * this location.
	 */
	if (XLByteLT(walsndctl->lsn[SYNC_REP_WAIT_WRITE], MyWalSnd->write))
	{
		walsndctl->lsn[SYNC_REP_WAIT_WRITE] = MyWalSnd->write;
		numwrite = SyncRepWakeQueue(false, SYNC_REP_WAIT_WRITE);
	}
	if (XLByteLT(walsndctl->lsn[SYNC_REP_WAIT_FLUSH], MyWalSnd->flush))
	{
		walsndctl->lsn[SYNC_REP_WAIT_FLUSH] = MyWalSnd->flush;
		numflush = SyncRepWakeQueue(false, SYNC_REP_WAIT_FLUSH);
	}

	LWLockRelease(SyncRepLock);

	elog(DEBUG3, "released %d procs up to write %X/%X, %d procs up to flush %X/%X",
		 numwrite,
		 MyWalSnd->write.xlogid,
		 MyWalSnd->write.xrecoff,
		 numflush,
		 MyWalSnd->flush.xlogid,
		 MyWalSnd->flush.xrecoff);

	/*
	 * If we are managing the highest priority standby, though we weren't
	 * prior to this, then announce we are now the sync standby.
	 */
	if (announce_next_takeover)
	{
		announce_next_takeover = false;
		ereport(LOG,
				(errmsg("standby \"%s\" is now the synchronous standby with priority %u",
						application_name, MyWalSnd->sync_standby_priority)));
	}
}