Esempio n. 1
0
/*
 * Replay the clearing of F_FOLLOW_RIGHT flag.
 */
static void
gistRedoClearFollowRight(RelFileNode node, XLogRecPtr lsn,
						 BlockNumber leftblkno)
{
	Buffer		buffer;

	buffer = XLogReadBuffer(node, leftblkno, false);
	if (BufferIsValid(buffer))
	{
		Page		page = (Page) BufferGetPage(buffer);

		/*
		 * Note that we still update the page even if page LSN is equal to the
		 * LSN of this record, because the updated NSN is not included in the
		 * full page image.
		 */
		if (!XLByteLT(lsn, PageGetLSN(page)))
		{
			GistPageGetOpaque(page)->nsn = lsn;
			GistClearFollowRight(page);

			PageSetLSN(page, lsn);
			PageSetTLI(page, ThisTimeLineID);
			MarkBufferDirty(buffer);
		}
		UnlockReleaseBuffer(buffer);
	}
}
Esempio n. 2
0
/*
 * Insert MyProc into SyncRepQueue, maintaining sorted invariant.
 *
 * Usually we will go at tail of queue, though it's possible that we arrive
 * here out of order, so start at tail and work back to insertion point.
 */
static void
SyncRepQueueInsert(void)
{
	PGPROC	*proc;

	proc = (PGPROC *) SHMQueuePrev(&(WalSndCtl->SyncRepQueue),
								   &(WalSndCtl->SyncRepQueue),
								   offsetof(PGPROC, syncRepLinks));

	while (proc)
	{
		/*
		 * Stop at the queue element that we should after to
		 * ensure the queue is ordered by LSN.
		 */
		if (XLByteLT(proc->waitLSN, MyProc->waitLSN))
			break;

		proc = (PGPROC *) SHMQueuePrev(&(WalSndCtl->SyncRepQueue),
									   &(proc->syncRepLinks),
									   offsetof(PGPROC, syncRepLinks));
	}

	if (proc)
		SHMQueueInsertAfter(&(proc->syncRepLinks), &(MyProc->syncRepLinks));
	else
		SHMQueueInsertAfter(&(WalSndCtl->SyncRepQueue), &(MyProc->syncRepLinks));
}
Esempio n. 3
0
/*
 * GetUndoRecPtr -- returns oldest PGPROC->logRec.
 */
XLogRecPtr
GetUndoRecPtr(void)
{
	SISeg	   *segP = shmInvalBuffer;
	ProcState  *stateP = segP->procState;
	XLogRecPtr	urec = {0, 0};
	XLogRecPtr	tempr;
	int			index;

	LWLockAcquire(SInvalLock, LW_SHARED);

	for (index = 0; index < segP->lastBackend; index++)
	{
		SHMEM_OFFSET pOffset = stateP[index].procStruct;

		if (pOffset != INVALID_OFFSET)
		{
			PGPROC	   *proc = (PGPROC *) MAKE_PTR(pOffset);

			tempr = proc->logRec;
			if (tempr.xrecoff == 0)
				continue;
			if (urec.xrecoff != 0 && XLByteLT(urec, tempr))
				continue;
			urec = tempr;
		}
	}

	LWLockRelease(SInvalLock);

	return (urec);
}
Esempio n. 4
0
/*
 *	visibilitymap_set - set a bit on a previously pinned page
 *
 * recptr is the LSN of the heap page. The LSN of the visibility map page is
 * advanced to that, to make sure that the visibility map doesn't get flushed
 * to disk before the update to the heap page that made all tuples visible.
 *
 * This is an opportunistic function. It does nothing, unless *buf
 * contains the bit for heapBlk. Call visibilitymap_pin first to pin
 * the right map page. This function doesn't do any I/O.
 */
void
visibilitymap_set(Relation rel, BlockNumber heapBlk, XLogRecPtr recptr,
				  Buffer *buf)
{
	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
	uint32		mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
	uint8		mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
	Page		page;
	char	   *map;

#ifdef TRACE_VISIBILITYMAP
	elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk);
#endif

	/* Check that we have the right page pinned */
	if (!BufferIsValid(*buf) || BufferGetBlockNumber(*buf) != mapBlock)
		return;

	page = BufferGetPage(*buf);
	map = PageGetContents(page);
	LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE);

	if (!(map[mapByte] & (1 << mapBit)))
	{
		map[mapByte] |= (1 << mapBit);

		if (XLByteLT(PageGetLSN(page), recptr))
			PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
		MarkBufferDirty(*buf);
	}

	LockBuffer(*buf, BUFFER_LOCK_UNLOCK);
}
Esempio n. 5
0
/*
 * Insert MyProc into the specified SyncRepQueue, maintaining sorted invariant.
 *
 * Usually we will go at tail of queue, though it's possible that we arrive
 * here out of order, so start at tail and work back to insertion point.
 */
static void
SyncRepQueueInsert(int mode)
{
	PGPROC	   *proc;

	Assert(mode >= 0 && mode < NUM_SYNC_REP_WAIT_MODE);
	proc = (PGPROC *) SHMQueuePrev(&(WalSndCtl->SyncRepQueue[mode]),
								   &(WalSndCtl->SyncRepQueue[mode]),
								   offsetof(PGPROC, syncRepLinks));

	while (proc)
	{
		/*
		 * Stop at the queue element that we should after to ensure the queue
		 * is ordered by LSN.
		 */
		if (XLByteLT(proc->waitLSN, MyProc->waitLSN))
			break;

		proc = (PGPROC *) SHMQueuePrev(&(WalSndCtl->SyncRepQueue[mode]),
									   &(proc->syncRepLinks),
									   offsetof(PGPROC, syncRepLinks));
	}

	if (proc)
		SHMQueueInsertAfter(&(proc->syncRepLinks), &(MyProc->syncRepLinks));
	else
		SHMQueueInsertAfter(&(WalSndCtl->SyncRepQueue[mode]), &(MyProc->syncRepLinks));
}
Esempio n. 6
0
/*
 * Save a new checkpoint location that the QD mirror can safely reply
 * through and release any XLOG files freed up by reply.
 *
 * The location will be saved and sent to the QD mirror on the WAL send
 * request.
 */
void
FtsQDMirroringNewCheckpointLoc(XLogRecPtr *newCheckpointLocation)
{
	Assert(newCheckpointLocation != NULL);

	LWLockAcquire(ftsQDMirrorLock, LW_EXCLUSIVE);

	if (ftsQDMirrorInfo->state != QDMIRROR_STATE_SYNCHRONIZED)
	{
		LWLockRelease(ftsQDMirrorLock);
		return;
	}

	if (ftsQDMirrorInfo->haveNewCheckpointLocation &&
		XLByteLT(*newCheckpointLocation, ftsQDMirrorInfo->newCheckpointLocation))
	{
		LWLockRelease(ftsQDMirrorLock);
		return;
	}

	ftsQDMirrorInfo->haveNewCheckpointLocation = true;
	ftsQDMirrorInfo->newCheckpointLocation = *newCheckpointLocation;

	LWLockRelease(ftsQDMirrorLock);
}
Esempio n. 7
0
/* Flush the log to disk */
static void
XLogWalRcvFlush(void)
{
	if (XLByteLT(LogstreamResult.Flush, LogstreamResult.Write))
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile WalRcvData *walrcv = WalRcv;

		issue_xlog_fsync(recvFile, recvId, recvSeg);

		LogstreamResult.Flush = LogstreamResult.Write;

		/* Update shared-memory status */
		SpinLockAcquire(&walrcv->mutex);
		walrcv->latestChunkStart = walrcv->receivedUpto;
		walrcv->receivedUpto = LogstreamResult.Flush;
		SpinLockRelease(&walrcv->mutex);

		/* Report XLOG streaming progress in PS display */
		if (update_process_title)
		{
			char		activitymsg[50];

			snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X",
					 LogstreamResult.Write.xlogid,
					 LogstreamResult.Write.xrecoff);
			set_ps_display(activitymsg, false);
		}
	}
}
Esempio n. 8
0
/*
 * Returns the oldest Send position among walsenders. Or InvalidXLogRecPtr
 * if none.
 */
XLogRecPtr
GetOldestWALSendPointer(void)
{
	XLogRecPtr	oldest = {0, 0};
	int			i;
	bool		found = false;

	for (i = 0; i < max_wal_senders; i++)
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile WalSnd *walsnd = &WalSndCtl->walsnds[i];
		XLogRecPtr	recptr;

		if (walsnd->pid == 0)
			continue;

		SpinLockAcquire(&walsnd->mutex);
		recptr = walsnd->sentPtr;
		SpinLockRelease(&walsnd->mutex);

		if (recptr.xlogid == 0 && recptr.xrecoff == 0)
			continue;

		if (!found || XLByteLT(recptr, oldest))
			oldest = recptr;
		found = true;
	}
	return oldest;
}
Esempio n. 9
0
/*
 * Flush the log to disk.
 *
 * If we're in the midst of dying, it's unwise to do anything that might throw
 * an error, so we skip sending a reply in that case.
 */
static void
XLogWalRcvFlush(bool dying)
{
	if (XLByteLT(LogstreamResult.Flush, LogstreamResult.Write))
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile WalRcvData *walrcv = WalRcv;

		issue_xlog_fsync(recvFile, recvId, recvSeg);

		LogstreamResult.Flush = LogstreamResult.Write;

		/* Update shared-memory status */
		SpinLockAcquire(&walrcv->mutex);
		if (XLByteLT(walrcv->receivedUpto, LogstreamResult.Flush))
		{
			walrcv->latestChunkStart = walrcv->receivedUpto;
			walrcv->receivedUpto = LogstreamResult.Flush;
		}
		SpinLockRelease(&walrcv->mutex);

		/* Signal the startup process and walsender that new WAL has arrived */
		WakeupRecovery();
		if (AllowCascadeReplication())
			WalSndWakeup();

		/* Report XLOG streaming progress in PS display */
		if (update_process_title)
		{
			char		activitymsg[50];

			snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X",
					 LogstreamResult.Write.xlogid,
					 LogstreamResult.Write.xrecoff);
			set_ps_display(activitymsg, false);
		}

		/* Also let the master know that we made some progress */
		if (!dying)
		{
			XLogWalRcvSendReply();
			XLogWalRcvSendHSFeedback();
		}
	}
}
Esempio n. 10
0
static void
bitmap_xlog_insert_bitmap_lastwords(bool redo, XLogRecPtr lsn, XLogRecord* record)
{
	xl_bm_bitmap_lastwords	*xlrec = 
		(xl_bm_bitmap_lastwords*) XLogRecGetData(record);
	Relation reln;

	reln = XLogOpenRelation(xlrec->bm_node);
	if (!RelationIsValid(reln))
		return;

	if (redo)
	{
		Buffer		lovBuffer;
		Page		lovPage;
		BMLOVItem	lovItem;

#ifdef BM_DEBUG
		ereport(LOG, (errcode(LOG), 
			errmsg("call bitmap_xlog_insert_bitmap_lastwords: redo=%d\n", 
					redo)));
#endif

		lovBuffer = XLogReadBuffer(false, reln, xlrec->bm_lov_blkno);
		if (!BufferIsValid(lovBuffer))
			elog(PANIC, "bm_insert_redo: block unfound: %d",
				 xlrec->bm_lov_blkno);

		lovPage = BufferGetPage(lovBuffer);

		if (XLByteLT(PageGetLSN(lovPage), lsn))
		{
			lovItem = (BMLOVItem)
				PageGetItem(lovPage, PageGetItemId(lovPage, xlrec->bm_lov_offset));

			lovItem->bm_last_compword = xlrec->bm_last_compword;
			lovItem->bm_last_word = xlrec->bm_last_word;
			lovItem->bm_last_two_headerbits = xlrec->bm_last_two_headerbits;

			PageSetLSN(lovPage, lsn);
			PageSetTLI(lovPage, ThisTimeLineID);
			_bitmap_wrtbuf(lovBuffer);
		}

		else
			_bitmap_relbuf(lovBuffer);
	}

	else
		elog(PANIC, "bm_insert_undo: not implemented.");
}
Esempio n. 11
0
static void
pushStackIfSplited(Page page, GistBDItem *stack)
{
	GISTPageOpaque opaque = GistPageGetOpaque(page);

	if (stack->blkno != GIST_ROOT_BLKNO && !XLogRecPtrIsInvalid(stack->parentlsn) &&
		XLByteLT(stack->parentlsn, opaque->nsn) &&
		opaque->rightlink != InvalidBlockNumber /* sanity check */ )
	{
		/* split page detected, install right link to the stack */

		GistBDItem *ptr = (GistBDItem *) palloc(sizeof(GistBDItem));

		ptr->blkno = opaque->rightlink;
		ptr->parentlsn = stack->parentlsn;
		ptr->next = stack->next;
		stack->next = ptr;
	}
}
Esempio n. 12
0
static void
bitmap_xlog_insert_meta(bool redo, XLogRecPtr lsn, XLogRecord* record)
{
	xl_bm_metapage	*xlrec = (xl_bm_metapage*) XLogRecGetData(record);
	Relation		reln;

	reln = XLogOpenRelation(xlrec->bm_node);
	
	if (!RelationIsValid(reln))
		return;

	if (redo)
	{
		Buffer			metabuf;
		BMMetaPage			metapage;

#ifdef BM_DEBUG
		ereport(LOG, (errcode(LOG), 
			errmsg("call bitmap_xlog_insert_meta: redo=%d\n", redo)));
#endif

		metabuf = XLogReadBuffer(false, reln, BM_METAPAGE);
		if (!BufferIsValid(metabuf))
			elog(PANIC, "bm_insert_redo: block unfound: %d", BM_METAPAGE);

		/* restore the page */
		metapage = (BMMetaPage)BufferGetPage(metabuf);

		if (XLByteLT(PageGetLSN(metapage), lsn))
		{
			PageSetLSN(metapage, lsn);
			PageSetTLI(metapage, ThisTimeLineID);
			_bitmap_wrtbuf(metabuf);
		}

		else
			_bitmap_relbuf(metabuf);
	}

	else
		elog(PANIC, "bm_insert_undo: not implemented.");
}
Esempio n. 13
0
/*
 * Scan all items on the GiST index page identified by *pageItem, and insert
 * them into the queue (or directly to output areas)
 *
 * scan: index scan we are executing
 * pageItem: search queue item identifying an index page to scan
 * myDistances: distances array associated with pageItem, or NULL at the root
 * tbm: if not NULL, gistgetbitmap's output bitmap
 * ntids: if not NULL, gistgetbitmap's output tuple counter
 *
 * If tbm/ntids aren't NULL, we are doing an amgetbitmap scan, and heap
 * tuples should be reported directly into the bitmap.	If they are NULL,
 * we're doing a plain or ordered indexscan.  For a plain indexscan, heap
 * tuple TIDs are returned into so->pageData[].  For an ordered indexscan,
 * heap tuple TIDs are pushed into individual search queue items.
 *
 * If we detect that the index page has split since we saw its downlink
 * in the parent, we push its new right sibling onto the queue so the
 * sibling will be processed next.
 */
static void
gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances,
			 TIDBitmap *tbm, int64 *ntids)
{
	GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
	Buffer		buffer;
	Page		page;
	GISTPageOpaque opaque;
	OffsetNumber maxoff;
	OffsetNumber i;
	GISTSearchTreeItem *tmpItem = so->tmpTreeItem;
	bool		isNew;
	MemoryContext oldcxt;

	Assert(!GISTSearchItemIsHeap(*pageItem));

	buffer = ReadBuffer(scan->indexRelation, pageItem->blkno);
	LockBuffer(buffer, GIST_SHARE);
	gistcheckpage(scan->indexRelation, buffer);
	page = BufferGetPage(buffer);
	opaque = GistPageGetOpaque(page);

	/*
	 * Check if we need to follow the rightlink. We need to follow it if the
	 * page was concurrently split since we visited the parent (in which case
	 * parentlsn < nsn), or if the the system crashed after a page split but
	 * before the downlink was inserted into the parent.
	 */
	if (!XLogRecPtrIsInvalid(pageItem->data.parentlsn) &&
		(GistFollowRight(page) ||
		 XLByteLT(pageItem->data.parentlsn, opaque->nsn)) &&
		opaque->rightlink != InvalidBlockNumber /* sanity check */ )
	{
		/* There was a page split, follow right link to add pages */
		GISTSearchItem *item;

		/* This can't happen when starting at the root */
		Assert(myDistances != NULL);

		oldcxt = MemoryContextSwitchTo(so->queueCxt);

		/* Create new GISTSearchItem for the right sibling index page */
		item = palloc(sizeof(GISTSearchItem));
		item->next = NULL;
		item->blkno = opaque->rightlink;
		item->data.parentlsn = pageItem->data.parentlsn;

		/* Insert it into the queue using same distances as for this page */
		tmpItem->head = item;
		tmpItem->lastHeap = NULL;
		memcpy(tmpItem->distances, myDistances,
			   sizeof(double) * scan->numberOfOrderBys);

		(void) rb_insert(so->queue, (RBNode *) tmpItem, &isNew);

		MemoryContextSwitchTo(oldcxt);
	}

	so->nPageData = so->curPageData = 0;

	/*
	 * check all tuples on page
	 */
	maxoff = PageGetMaxOffsetNumber(page);
	for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
	{
		IndexTuple	it = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
		bool		match;
		bool		recheck;

		/*
		 * Must call gistindex_keytest in tempCxt, and clean up any leftover
		 * junk afterward.
		 */
		oldcxt = MemoryContextSwitchTo(so->giststate->tempCxt);

		match = gistindex_keytest(scan, it, page, i, &recheck);

		MemoryContextSwitchTo(oldcxt);
		MemoryContextReset(so->giststate->tempCxt);

		/* Ignore tuple if it doesn't match */
		if (!match)
			continue;

		if (tbm && GistPageIsLeaf(page))
		{
			/*
			 * getbitmap scan, so just push heap tuple TIDs into the bitmap
			 * without worrying about ordering
			 */
			tbm_add_tuples(tbm, &it->t_tid, 1, recheck);
			(*ntids)++;
		}
		else if (scan->numberOfOrderBys == 0 && GistPageIsLeaf(page))
		{
			/*
			 * Non-ordered scan, so report heap tuples in so->pageData[]
			 */
			so->pageData[so->nPageData].heapPtr = it->t_tid;
			so->pageData[so->nPageData].recheck = recheck;
			so->nPageData++;
		}
		else
		{
			/*
			 * Must push item into search queue.  We get here for any lower
			 * index page, and also for heap tuples if doing an ordered
			 * search.
			 */
			GISTSearchItem *item;

			oldcxt = MemoryContextSwitchTo(so->queueCxt);

			/* Create new GISTSearchItem for this item */
			item = palloc(sizeof(GISTSearchItem));
			item->next = NULL;

			if (GistPageIsLeaf(page))
			{
				/* Creating heap-tuple GISTSearchItem */
				item->blkno = InvalidBlockNumber;
				item->data.heap.heapPtr = it->t_tid;
				item->data.heap.recheck = recheck;
			}
			else
			{
				/* Creating index-page GISTSearchItem */
				item->blkno = ItemPointerGetBlockNumber(&it->t_tid);
				/* lsn of current page is lsn of parent page for child */
				item->data.parentlsn = PageGetLSN(page);
			}

			/* Insert it into the queue using new distance data */
			tmpItem->head = item;
			tmpItem->lastHeap = GISTSearchItemIsHeap(*item) ? item : NULL;
			memcpy(tmpItem->distances, so->distances,
				   sizeof(double) * scan->numberOfOrderBys);

			(void) rb_insert(so->queue, (RBNode *) tmpItem, &isNew);

			MemoryContextSwitchTo(oldcxt);
		}
	}

	UnlockReleaseBuffer(buffer);
}
Esempio n. 14
0
/*
 * Physical write of a page from a buffer slot
 *
 * On failure, we cannot just ereport(ERROR) since caller has put state in
 * shared memory that must be undone.  So, we return FALSE and save enough
 * info in static variables to let SlruReportIOError make the report.
 *
 * For now, assume it's not worth keeping a file pointer open across
 * independent read/write operations.  We do batch operations during
 * SimpleLruFlush, though.
 *
 * fdata is NULL for a standalone write, pointer to open-file info during
 * SimpleLruFlush.
 */
static bool
SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
{
	SlruShared	shared = ctl->shared;
	int			segno = pageno / SLRU_PAGES_PER_SEGMENT;
	int			rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
	int			offset = rpageno * BLCKSZ;
	char		path[MAXPGPATH];
	int			fd = -1;
	struct timeval tv;

	/*
	 * Honor the write-WAL-before-data rule, if appropriate, so that we do not
	 * write out data before associated WAL records.  This is the same action
	 * performed during FlushBuffer() in the main buffer manager.
	 */
	if (shared->group_lsn != NULL)
	{
		/*
		 * We must determine the largest async-commit LSN for the page. This
		 * is a bit tedious, but since this entire function is a slow path
		 * anyway, it seems better to do this here than to maintain a per-page
		 * LSN variable (which'd need an extra comparison in the
		 * transaction-commit path).
		 */
		XLogRecPtr	max_lsn;
		int			lsnindex,
					lsnoff;

		lsnindex = slotno * shared->lsn_groups_per_page;
		max_lsn = shared->group_lsn[lsnindex++];
		for (lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
		{
			XLogRecPtr	this_lsn = shared->group_lsn[lsnindex++];

			if (XLByteLT(max_lsn, this_lsn))
				max_lsn = this_lsn;
		}

		if (!XLogRecPtrIsInvalid(max_lsn))
		{
			/*
			 * As noted above, elog(ERROR) is not acceptable here, so if
			 * XLogFlush were to fail, we must PANIC.  This isn't much of a
			 * restriction because XLogFlush is just about all critical
			 * section anyway, but let's make sure.
			 */
			START_CRIT_SECTION();
			XLogFlush(max_lsn);
			END_CRIT_SECTION();
		}
	}

	/*
	 * During a Flush, we may already have the desired file open.
	 */
	if (fdata)
	{
		int			i;

		for (i = 0; i < fdata->num_files; i++)
		{
			if (fdata->segno[i] == segno)
			{
				fd = fdata->fd[i];
				break;
			}
		}
	}

	if (fd < 0)
	{
		/*
		 * If the file doesn't already exist, we should create it.  It is
		 * possible for this to need to happen when writing a page that's not
		 * first in its segment; we assume the OS can cope with that. (Note:
		 * it might seem that it'd be okay to create files only when
		 * SimpleLruZeroPage is called for the first page of a segment.
		 * However, if after a crash and restart the REDO logic elects to
		 * replay the log from a checkpoint before the latest one, then it's
		 * possible that we will get commands to set transaction status of
		 * transactions that have already been truncated from the commit log.
		 * Easiest way to deal with that is to accept references to
		 * nonexistent files here and in SlruPhysicalReadPage.)
		 *
		 * Note: it is possible for more than one backend to be executing this
		 * code simultaneously for different pages of the same file. Hence,
		 * don't use O_EXCL or O_TRUNC or anything like that.
		 */
		SlruFileName(ctl, path, segno);
		fd = BasicOpenFile(path, O_RDWR | O_CREAT | PG_BINARY,
						   S_IRUSR | S_IWUSR);
		if (fd < 0)
		{
			slru_errcause = SLRU_OPEN_FAILED;
			slru_errno = errno;
			return false;
		}

		if (fdata)
		{
			if (fdata->num_files < MAX_FLUSH_BUFFERS)
			{
				fdata->fd[fdata->num_files] = fd;
				fdata->segno[fdata->num_files] = segno;
				fdata->num_files++;
			}
			else
			{
				/*
				 * In the unlikely event that we exceed MAX_FLUSH_BUFFERS,
				 * fall back to treating it as a standalone write.
				 */
				fdata = NULL;
			}
		}
	}

	if (lseek(fd, (off_t) offset, SEEK_SET) < 0)
	{
		slru_errcause = SLRU_SEEK_FAILED;
		slru_errno = errno;
		if (!fdata)
			close(fd);
		return false;
	}

	errno = 0;
	if (write(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ)
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		slru_errcause = SLRU_WRITE_FAILED;
		slru_errno = errno;
		if (!fdata)
			close(fd);
		return false;
	}
#ifdef XP_TRACE_LRU_WRITE
	gettimeofday(&tv, NULL);
	ereport(TRACE_LEVEL,
		(errmsg("%ld.%ld:\tWRITE:\tSlruPhysicalWritePage:\tfile:%s",
				tv.tv_sec, tv.tv_usec, path)));
#endif

	/*
	 * If not part of Flush, need to fsync now.  We assume this happens
	 * infrequently enough that it's not a performance issue.
	 */
	if (!fdata)
	{
		if (ctl->do_fsync && pg_fsync(fd))
		{
			slru_errcause = SLRU_FSYNC_FAILED;
			slru_errno = errno;
			close(fd);
			return false;
		}

		if (close(fd))
		{
			slru_errcause = SLRU_CLOSE_FAILED;
			slru_errno = errno;
			return false;
		}
	}

	return true;
}
Esempio n. 15
0
void
cdb_perform_redo(XLogRecPtr *redoCheckPointLoc, CheckPoint *redoCheckPoint, XLogRecPtr *newCheckpointLoc)
{
	CheckPoint oldRedoCheckpoint;
	uint32 logid;
	uint32 seg;
	int nsegsremoved;
	
	if (redoCheckPointLoc->xlogid == 0 && redoCheckPointLoc->xrecoff == 0)
	{
		XLogGetRecoveryStart("QDSYNC", "for redo apply", redoCheckPointLoc, redoCheckPoint);
	}
	
	XLogStandbyRecoverRange(redoCheckPointLoc, redoCheckPoint, newCheckpointLoc);

	/*
	 * Sample the recovery start location now to see if appling redo
	 * processed checkpoint records and moved the restart location forward.
	 */
	oldRedoCheckpoint = *redoCheckPoint;

	XLogGetRecoveryStart("QDSYNC", "for redo progress check", redoCheckPointLoc, redoCheckPoint);

	if (XLByteLT(oldRedoCheckpoint.redo,redoCheckPoint->redo))
	{
		ereport(LOG,
		 (errmsg("QDSYNC: transaction redo moved the restart location from %s to %s",
			     XLogLocationToString(&oldRedoCheckpoint.redo),
			     XLogLocationToString2(&redoCheckPoint->redo))));
	}
	else
	{
		Assert(XLByteEQ(oldRedoCheckpoint.redo,redoCheckPoint->redo));
		ereport(LOG,
		 (errmsg("QDSYNC: transaction redo did not move the restart location %s forward this pass",
			     XLogLocationToString(&oldRedoCheckpoint.redo))));
		return;
	}

	XLByteToSeg(redoCheckPoint->redo, logid, seg);
	
	/*
	 * Delete offline log files (those no longer needed even for previous
	 * checkpoint).
	 */
	elog((Debug_print_qd_mirroring ? LOG : DEBUG5),
	     "QDSYNC: keep log files as far back as (logid %d, seg %d)",
		 logid, seg);

	if (logid || seg)
	{
		PrevLogSeg(logid, seg);
		elog((Debug_print_qd_mirroring ? LOG : DEBUG5),
			 "QDSYNC: delete offline log files up to (logid %d, seg %d)",
			 logid, seg);
		
		XLogRemoveStandbyLogs(logid, seg, &nsegsremoved);

		if (nsegsremoved > 0)
		{
		// Throw in extra new line to make log more readable.
			ereport(LOG,
			 (errmsg("QDSYNC: %d logs removed through logid %d, seg %d\n",
				     nsegsremoved,
				     logid, seg)));
		}

	}
	// Throw in extra new line to make log more readable.
	elog(LOG,"--------------------------");
}
Esempio n. 16
0
static void
WalSendServerDoRequest(WalSendRequest *walSendRequest)
{
	bool successful;
	struct timeval standbyTimeout;

	WalSendServerGetStandbyTimeout(&standbyTimeout);
	
	switch (walSendRequest->command)
	{
	case PositionToEnd:
		elog((Debug_print_qd_mirroring ? LOG : DEBUG5), "PositionToEnd");

		successful = write_position_to_end(&originalEndLocation,
			                               NULL, &walsend_shutdown_requested);
		if (successful)
			elog(LOG,"Standby master returned transaction log end location %s",
				 XLogLocationToString(&originalEndLocation));
		else
		{
			disableQDMirroring_ConnectionError(
				"Unable to connect to standby master and determine transaction log end location",
				GetStandbyErrorString());
			disconnectMirrorQD_SendClose();
		}
		break;
		
	case Catchup:
		elog((Debug_print_qd_mirroring ? LOG : DEBUG5), "Catchup");

        if (isQDMirroringCatchingUp())
    	{
    		bool tooFarBehind = false;

			elog(LOG,"Current master transaction log is flushed through location %s",
				 XLogLocationToString(&walSendRequest->flushedLocation));
			
			if (XLByteLT(originalEndLocation, walSendRequest->flushedLocation))
			{
				/*
				 * Standby master is behind the primary.  Send catchup WAL.
				 */
				 
				/* 
				 * Use a TRY block to catch errors from our attempt to read
				 * the primary's WAL.  Errors from sending to the standby
				 * come up as a boolean return (successful).
				 */
				PG_TRY();
				{
					successful = XLogCatchupQDMirror(
									&originalEndLocation, 
									&walSendRequest->flushedLocation,
									&standbyTimeout,
									&walsend_shutdown_requested);
				}
				PG_CATCH();
				{
					/* 
					 * Report the error related to reading the primary's WAL
					 * to the server log 
					 */
					 
					/* 
					 * But first demote the error to something much less
					 * scary.
					 */
				    if (!elog_demote(WARNING))
			    	{
			    		elog(LOG,"unable to demote error");
			        	PG_RE_THROW();
			    	}
					
					EmitErrorReport();
					FlushErrorState();

					successful = false;
					tooFarBehind = true;
				}
				PG_END_TRY();
				
				if (successful)
				{
					elog((Debug_print_qd_mirroring ? LOG : DEBUG5),
						 "catchup send from standby end %s through primary flushed location %s",
						 XLogLocationToString(&originalEndLocation),
						 XLogLocationToString2(&walSendRequest->flushedLocation));
				}

			}
			else if (XLByteEQ(originalEndLocation, walSendRequest->flushedLocation))
			{
				elog((Debug_print_qd_mirroring ? LOG : DEBUG5),"Mirror was already caught up");
				successful = true;
			}
			else
			{
				elog(WARNING,"Standby master transaction log location %s is beyond the current master end location %s",
				     XLogLocationToString(&originalEndLocation),
				     XLogLocationToString2(&walSendRequest->flushedLocation));
				successful = false;
			}
			
			if (successful)
			{
				char detail[200];
				int count;
				
				count = snprintf(
							 detail, sizeof(detail),
							 "Transaction log copied from locations %s through %s to the standby master",
						     XLogLocationToString(&originalEndLocation),
						     XLogLocationToString2(&walSendRequest->flushedLocation));
				if (count >= sizeof(detail))
				{
					ereport(ERROR,
							(errcode(ERRCODE_INTERNAL_ERROR),
							 errmsg("format command string failure")));
				}

				enableQDMirroring("Master mirroring is now synchronized", detail);

				currentEndLocation = walSendRequest->flushedLocation;

				periodicLen = 0;
				periodicLocation = currentEndLocation;
			}
			else
			{
				if (tooFarBehind)
				{
					disableQDMirroring_TooFarBehind(
						"The current master was unable to synchronize the standby master "
						"because the transaction logs on the current master were recycled.  "
						"A gpinitstandby (at an appropriate time) will be necessary to copy "
						"over the whole master database to the standby master so it may be synchronized");
				}
				else
				{
					disableQDMirroring_ConnectionError(
						"Connection to the standby master was lost during transaction log catchup",
						GetStandbyErrorString());
				}
				disconnectMirrorQD_SendClose();
			}
		}
		else if (isQDMirroringDisabled())
		{
			elog((Debug_print_qd_mirroring ? LOG : DEBUG5), "Master Mirror Send: Master mirroring not catching-up (state is disabled)");
		}
		else
		{
			elog(ERROR,"unexpected master mirroring state %s",
				 QDMirroringStateString());
		}
		
		break;
		
	case WriteWalPages:
		if (Debug_print_qd_mirroring)
			elog(LOG, "WriteWalPages");
		
        if (isQDMirroringEnabled())
        {
			char	   *from;
			Size		nbytes;
			bool		more= false;

			/*
			 * For now, save copy of data until flush.  This could be
			 * optimized.
			 */
			if (saveBuffer == NULL)
			{
				uint32 totalBufferLen = XLOGbuffers * XLOG_BLCKSZ;
				
				saveBuffer = malloc(totalBufferLen);
				if (saveBuffer == NULL)
					elog(ERROR,"Could not allocate buffer for xlog data (%d bytes)",
					     totalBufferLen);
				
				saveBufferLen = 0;
			}

			XLogGetBuffer(walSendRequest->startidx, walSendRequest->npages,
				          &from, &nbytes);

			if (saveBufferLen == 0)
			{
				more = false;
				writeLogId = walSendRequest->logId;
				writeLogSeg = walSendRequest->logSeg;
				writeLogOff = walSendRequest->logOff;

				memcpy(saveBuffer, from, nbytes);
				saveBufferLen = nbytes;
			}
			else
			{
				more = true;
				memcpy(&saveBuffer[saveBufferLen], from, nbytes);
				saveBufferLen += nbytes;
			}
			
			if (Debug_print_qd_mirroring)
				elog(LOG,
					 "Master Mirror Send: WriteWalPages (%s) startidx %d, npages %d, timeLineID %d, logId %u, logSeg %u, logOff 0x%X, nbytes 0x%X",
					 (more ? "more" : "new"),
					 walSendRequest->startidx,
					 walSendRequest->npages,
					 walSendRequest->timeLineID,
					 walSendRequest->logId,
					 walSendRequest->logSeg,
					 walSendRequest->logOff,
					 (int)nbytes);
    	}

	case FlushWalPages:
		if (Debug_print_qd_mirroring)
			elog(LOG, "FlushWalPages");
		
        if (isQDMirroringEnabled())
        {
			char 		cmd[MAXFNAMELEN + 50];

			if (saveBufferLen == 0)
				successful = true;
			else
			{
				if (snprintf(cmd, sizeof(cmd),"xlog %d %d %d %d", 
							 writeLogId, writeLogSeg, writeLogOff, 
							 (int)saveBufferLen) >= sizeof(cmd))
					elog(ERROR,"could not create cmd for qd mirror logid %d seg %d", 
					     writeLogId, writeLogSeg);
				
				successful = write_qd_sync(cmd, saveBuffer, saveBufferLen, 
							               &standbyTimeout,
							               &walsend_shutdown_requested);
				if (successful)
				{
					XLogRecPtr oldEndLocation;
					
					oldEndLocation = currentEndLocation;

					currentEndLocation.xlogid = writeLogId;
					currentEndLocation.xrecoff = writeLogSeg * XLogSegSize + writeLogOff;
					if (currentEndLocation.xrecoff >= XLogFileSize)
					{
						(currentEndLocation.xlogid)++;
						currentEndLocation.xrecoff = 0;
					}

					if (XLByteLT(oldEndLocation,currentEndLocation))
					{
						periodicLen += saveBufferLen;
						if (periodicLen > periodicReportLen)
						{
							elog(LOG,
								 "Master mirroring periodic report: %d bytes successfully send to standby master for locations %s through %s",
								 periodicLen,
								 XLogLocationToString(&periodicLocation),
								 XLogLocationToString2(&currentEndLocation));

							periodicLen = 0;
							periodicLocation = currentEndLocation;
						}
					}
					else
					{
						if (Debug_print_qd_mirroring)
							elog(LOG,
							     "Send to Master mirror successful.  New end location %s (old %s)",
							     XLogLocationToString(&currentEndLocation),
							     XLogLocationToString2(&oldEndLocation));
					}
				}
				else
				{
					disableQDMirroring_ConnectionError(
						"Connection to the standby master was lost attempting to send new transaction log",
						GetStandbyErrorString());
					disconnectMirrorQD_SendClose();
				}

				/*
				 * Reset so WriteWalPages can fill the buffer again.
				 */
				saveBufferLen = 0;
				writeLogId = 0;
				writeLogSeg = 0;
				writeLogOff = 0;
			}
			
			if (successful && walSendRequest->haveNewCheckpointLocation)
			{
				uint32 logid;
				uint32 seg;
				uint32 offset;
				
	        	elog((Debug_print_qd_mirroring ? LOG : DEBUG5),"New previous checkpoint location %s",
				     XLogLocationToString(&walSendRequest->newCheckpointLocation));
				XLByteToSeg(walSendRequest->newCheckpointLocation, logid, seg);
				offset = walSendRequest->newCheckpointLocation.xrecoff % XLogSegSize;
				
				if (snprintf(cmd, sizeof(cmd),"new_checkpoint_location %d %d %d", 
							 logid, seg, offset) >= sizeof(cmd))
					elog(ERROR,"could not create cmd for qd mirror logid %d seg %d offset %d", 
					     logid, seg, offset);
				
				successful = write_qd_sync(cmd, NULL, 0, 
					                       NULL, &walsend_shutdown_requested);
				if (successful)
				{
					elog((Debug_print_qd_mirroring ? LOG : DEBUG5),"Send of new checkpoint location to master mirror successful");
				}
				else
				{
					disableQDMirroring_ConnectionError(
						"Connection to the standby master was lost attempting to send new checkpoint location",
						GetStandbyErrorString());
					disconnectMirrorQD_SendClose();
				}
			}
			
        }
		else if (isQDMirroringDisabled())
		{
			elog((Debug_print_qd_mirroring ? LOG : DEBUG5), "Master Mirror Send: Master mirroring not enabled");
		}
		else
		{
			elog(ERROR,"unexpected master mirroring state %s",
				 QDMirroringStateString());
		}
		
		break;

	case CloseForShutdown:
		if (Debug_print_qd_mirroring)
			elog(LOG, "CloseForShutdown");

		/*
		 * Do the work we would normally do when signaled to stop.
		 */
		WalSendServer_ServiceShutdown();
		break;

	default:
		elog(ERROR, "Unknown WalSendRequestCommand %d", walSendRequest->command);
	}

}
Esempio n. 17
0
/*
 * Traverse the tree to find path from root page to specified "child" block.
 *
 * returns from the beginning of closest parent;
 *
 * To prevent deadlocks, this should lock only one page simultaneously.
 */
GISTInsertStack *
gistFindPath(Relation r, BlockNumber child)
{
	Page		page;
	Buffer		buffer;
	OffsetNumber i,
				maxoff;
	ItemId		iid;
	IndexTuple	idxtuple;
	GISTInsertStack *top,
			   *tail,
			   *ptr;
	BlockNumber blkno;

	MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD;

	top = tail = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
	top->blkno = GIST_ROOT_BLKNO;

	while (top && top->blkno != child)
	{
		buffer = ReadBuffer(r, top->blkno);
		LockBuffer(buffer, GIST_SHARE);
		gistcheckpage(r, buffer);
		page = (Page) BufferGetPage(buffer);

		if (GistPageIsLeaf(page))
		{
			/* we can safety go away, follows only leaf pages */
			UnlockReleaseBuffer(buffer);
			return NULL;
		}

		top->lsn = PageGetLSN(page);

		if (top->parent && XLByteLT(top->parent->lsn, GistPageGetOpaque(page)->nsn) &&
			GistPageGetOpaque(page)->rightlink != InvalidBlockNumber /* sanity check */ )
		{
			/* page splited while we thinking of... */
			ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
			ptr->blkno = GistPageGetOpaque(page)->rightlink;
			ptr->childoffnum = InvalidOffsetNumber;
			ptr->parent = top;
			ptr->next = NULL;
			tail->next = ptr;
			tail = ptr;
		}

		maxoff = PageGetMaxOffsetNumber(page);

		for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
		{
			iid = PageGetItemId(page, i);
			idxtuple = (IndexTuple) PageGetItem(page, iid);
			blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
			if (blkno == child)
			{
				OffsetNumber poff = InvalidOffsetNumber;

				/* make childs links */
				ptr = top;
				while (ptr->parent)
				{
					/* set child link */
					ptr->parent->child = ptr;
					/* move childoffnum.. */
					if (ptr == top)
					{
						/* first iteration */
						poff = ptr->parent->childoffnum;
						ptr->parent->childoffnum = ptr->childoffnum;
					}
					else
					{
						OffsetNumber tmp = ptr->parent->childoffnum;

						ptr->parent->childoffnum = poff;
						poff = tmp;
					}
					ptr = ptr->parent;
				}
				top->childoffnum = i;
				UnlockReleaseBuffer(buffer);
				return top;
			}
			else
			{
				/* Install next inner page to the end of stack */
				ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
				ptr->blkno = blkno;
				ptr->childoffnum = i;	/* set offsetnumber of child to child
										 * !!! */
				ptr->parent = top;
				ptr->next = NULL;
				tail->next = ptr;
				tail = ptr;
			}
		}

		UnlockReleaseBuffer(buffer);
		top = top->next;
	}

	return NULL;
}
Esempio n. 18
0
static void
bitmap_xlog_insert_lovmeta(bool redo, XLogRecPtr lsn, XLogRecord* record)
{
	xl_bm_lovmetapage	*xlrec = (xl_bm_lovmetapage*)XLogRecGetData(record);
	Relation reln;

	reln = XLogOpenRelation(xlrec->bm_node);
	/* reln = XLogOpenRelation(redo, RM_BITMAP_ID, xlrec->bm_node);*/
	
	if (!RelationIsValid(reln))
		return;

	if (redo)
	{
		Buffer	lovMetabuf;
		Page	lovMetapage;
		BMLOVMetaItem	copyMetaItems, metaItems;

#ifdef BM_DEBUG
		ereport(LOG, (errcode(LOG), 
			errmsg("call bitmap_xlog_insert_lovmeta: redo=%d\n", redo)));
#endif

		lovMetabuf = XLogReadBuffer(false, reln, BM_LOV_STARTPAGE-1);
		if (!BufferIsValid(lovMetabuf))
			elog(PANIC, "bm_insert_redo: block unfound: %d -- at (%d,%d,%d)", 
				 BM_LOV_STARTPAGE-1, xlrec->bm_node.spcNode, 
				 xlrec->bm_node.dbNode, xlrec->bm_node.relNode);

		lovMetapage = BufferGetPage(lovMetabuf);

		if (XLByteLT(PageGetLSN(lovMetapage), lsn))
		{
#ifdef BM_DEBUG
			uint32 attno;
#endif

			copyMetaItems = (BMLOVMetaItem)PageGetContents(lovMetapage);

			metaItems = (BMLOVMetaItem)
				((char*)xlrec + sizeof(xl_bm_lovmetapage));
			memcpy(copyMetaItems, metaItems, 
					xlrec->bm_num_of_attrs * sizeof(BMLOVMetaItemData));

#ifdef BM_DEBUG
			for(attno=0; attno<xlrec->bm_num_of_attrs; attno++)
				elog(LOG, "metaItems=%d, %d, %d", 
					 copyMetaItems[attno].bm_lov_heapId,
					 copyMetaItems[attno].bm_lov_indexId, 
					 copyMetaItems[attno].bm_lov_lastpage);
#endif

			PageSetLSN(lovMetapage, lsn);
			PageSetTLI(lovMetapage, ThisTimeLineID);
			_bitmap_wrtbuf(lovMetabuf);
		}

		else
			_bitmap_relbuf(lovMetabuf);
	}

	else
		elog(PANIC, "bm_insert_undo: not implemented.");		
}
Esempio n. 19
0
static void
bitmap_xlog_insert_lovitem(bool redo, XLogRecPtr lsn, XLogRecord* record)
{
	xl_bm_lovitem	*xlrec = (xl_bm_lovitem*) XLogRecGetData(record);
	Relation reln;

	reln = XLogOpenRelation(xlrec->bm_node);
	if (!RelationIsValid(reln))
		return;

	if (redo)
	{
		Buffer			lovBuffer;
		Page			lovPage;

#ifdef BM_DEBUG
		ereport(LOG, (errcode(LOG), 
			errmsg("call bitmap_xlog_insert_lovitem: redo=%d, blkno=%d\n", 
					redo, xlrec->bm_lov_blkno)));
#endif

		lovBuffer = XLogReadBuffer(false, reln, xlrec->bm_lov_blkno);
		if (!BufferIsValid(lovBuffer))
			elog(PANIC, "bm_insert_redo: block unfound: %d",
				 xlrec->bm_lov_blkno);

		lovPage = BufferGetPage(lovBuffer);

		if (XLByteLT(PageGetLSN(lovPage), lsn))
		{
			if(xlrec->bm_isNewItem)
			{
				OffsetNumber	newOffset, itemSize;

				newOffset = OffsetNumberNext(PageGetMaxOffsetNumber(lovPage));
				if (newOffset != xlrec->bm_lov_offset)
					elog(PANIC, 
				"bm_insert_redo: LOV item is not inserted in pos %d(requested %d)",
						 newOffset, xlrec->bm_lov_offset);

				itemSize = sizeof(BMLOVItemData);
				if (itemSize > PageGetFreeSpace(lovPage))
					elog(PANIC, 
						 "bm_insert_redo: not enough space in LOV page %d",
						 xlrec->bm_lov_blkno);
		
				if (PageAddItem(lovPage, (Item)&(xlrec->bm_lovItem), itemSize, 
								newOffset, LP_USED) == InvalidOffsetNumber)
					ereport(ERROR,
							(errcode(ERRCODE_INTERNAL_ERROR),
							errmsg("failed to add LOV item to \"%s\"",
							RelationGetRelationName(reln))));
			}

			else{
				BMLOVItem oldLovItem;
				oldLovItem = (BMLOVItem)
					PageGetItem(lovPage, 
								PageGetItemId(lovPage, xlrec->bm_lov_offset));

				memcpy(oldLovItem, &(xlrec->bm_lovItem), sizeof(BMLOVItemData));
			}

			PageSetLSN(lovPage, lsn);
			PageSetTLI(lovPage, ThisTimeLineID);
			_bitmap_wrtbuf(lovBuffer);
		}

		else {
			_bitmap_relbuf(lovBuffer);
		}
	}

	else
		elog(PANIC, "bm_insert_undo: not implemented.");
}
Esempio n. 20
0
/*
 * Workhouse routine for doing insertion into a GiST index. Note that
 * this routine assumes it is invoked in a short-lived memory context,
 * so it does not bother releasing palloc'd allocations.
 */
static void
gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
{
	ItemId		iid;
	IndexTuple	idxtuple;
	GISTInsertStack firststack;
	GISTInsertStack *stack;
	GISTInsertState state;
	bool		xlocked = false;

	memset(&state, 0, sizeof(GISTInsertState));
	state.freespace = freespace;
	state.r = r;

	/* Start from the root */
	firststack.blkno = GIST_ROOT_BLKNO;
	firststack.lsn.xrecoff = 0;
	firststack.parent = NULL;
	state.stack = stack = &firststack;

	/*
	 * Walk down along the path of smallest penalty, updating the parent
	 * pointers with the key we're inserting as we go. If we crash in the
	 * middle, the tree is consistent, although the possible parent updates
	 * were a waste.
	 */
	for (;;)
	{
		if (XLogRecPtrIsInvalid(stack->lsn))
			stack->buffer = ReadBuffer(state.r, stack->blkno);

		/*
		 * Be optimistic and grab shared lock first. Swap it for an
		 * exclusive lock later if we need to update the page.
		 */
		if (!xlocked)
		{
			LockBuffer(stack->buffer, GIST_SHARE);
			gistcheckpage(state.r, stack->buffer);
		}

		stack->page = (Page) BufferGetPage(stack->buffer);
		stack->lsn = PageGetLSN(stack->page);
		Assert(!RelationNeedsWAL(state.r) || !XLogRecPtrIsInvalid(stack->lsn));

		/*
		 * If this page was split but the downlink was never inserted to
		 * the parent because the inserting backend crashed before doing
		 * that, fix that now.
		 */
		if (GistFollowRight(stack->page))
		{
			if (!xlocked)
			{
				LockBuffer(stack->buffer, GIST_UNLOCK);
				LockBuffer(stack->buffer, GIST_EXCLUSIVE);
				xlocked = true;
				/* someone might've completed the split when we unlocked */
				if (!GistFollowRight(stack->page))
					continue;
			}
			gistfixsplit(&state, giststate);

			UnlockReleaseBuffer(stack->buffer);
			xlocked = false;
			state.stack = stack = stack->parent;
			continue;
		}

		if (stack->blkno != GIST_ROOT_BLKNO &&
			XLByteLT(stack->parent->lsn,
					 GistPageGetOpaque(stack->page)->nsn))
		{
			/*
			 * Concurrent split detected. There's no guarantee that the
			 * downlink for this page is consistent with the tuple we're
			 * inserting anymore, so go back to parent and rechoose the
			 * best child.
			 */
			UnlockReleaseBuffer(stack->buffer);
			xlocked = false;
			state.stack = stack = stack->parent;
			continue;
		}

		if (!GistPageIsLeaf(stack->page))
		{
			/*
			 * This is an internal page so continue to walk down the tree.
			 * Find the child node that has the minimum insertion penalty.
			 */
			BlockNumber childblkno;
			IndexTuple newtup;
			GISTInsertStack *item;

			stack->childoffnum = gistchoose(state.r, stack->page, itup, giststate);
			iid = PageGetItemId(stack->page, stack->childoffnum);
			idxtuple = (IndexTuple) PageGetItem(stack->page, iid);
			childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));

			/*
			 * Check that it's not a leftover invalid tuple from pre-9.1
			 */
			if (GistTupleIsInvalid(idxtuple))
				ereport(ERROR,
						(errmsg("index \"%s\" contains an inner tuple marked as invalid",
								RelationGetRelationName(r)),
						 errdetail("This is caused by an incomplete page split at crash recovery before upgrading to 9.1."),
						 errhint("Please REINDEX it.")));

			/*
			 * Check that the key representing the target child node is
			 * consistent with the key we're inserting. Update it if it's not.
			 */
			newtup = gistgetadjusted(state.r, idxtuple, itup, giststate);
			if (newtup)
			{
				/*
				 * Swap shared lock for an exclusive one. Beware, the page
				 * may change while we unlock/lock the page...
				 */
				if (!xlocked)
				{
					LockBuffer(stack->buffer, GIST_UNLOCK);
					LockBuffer(stack->buffer, GIST_EXCLUSIVE);
					xlocked = true;
					stack->page = (Page) BufferGetPage(stack->buffer);

					if (!XLByteEQ(PageGetLSN(stack->page), stack->lsn))
					{
						/* the page was changed while we unlocked it, retry */
						continue;
					}
				}
				/*
				 * Update the tuple.
				 *
				 * gistinserthere() might have to split the page to make the
				 * updated tuple fit. It will adjust the stack so that after
				 * the call, we'll be holding a lock on the page containing
				 * the tuple, which might have moved right.
				 *
				 * Except if this causes a root split, gistinserthere()
				 * returns 'true'. In that case, stack only holds the new
				 * root, and the child page was released. Have to start
				 * all over.
				 */
				if (gistinserttuples(&state, stack, giststate, &newtup, 1,
									 stack->childoffnum, InvalidBuffer))
				{
					UnlockReleaseBuffer(stack->buffer);
					xlocked = false;
					state.stack = stack = stack->parent;
					continue;
				}
			}
			LockBuffer(stack->buffer, GIST_UNLOCK);
			xlocked = false;

			/* descend to the chosen child */
			item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
			item->blkno = childblkno;
			item->parent = stack;
			state.stack = stack = item;
		}
		else
		{
			/*
			 * Leaf page. Insert the new key. We've already updated all the
			 * parents on the way down, but we might have to split the page
			 * if it doesn't fit. gistinserthere() will take care of that.
			 */

			/*
			 * Swap shared lock for an exclusive one. Be careful, the page
			 * may change while we unlock/lock the page...
			 */
			if (!xlocked)
			{
				LockBuffer(stack->buffer, GIST_UNLOCK);
				LockBuffer(stack->buffer, GIST_EXCLUSIVE);
				xlocked = true;
				stack->page = (Page) BufferGetPage(stack->buffer);
				stack->lsn = PageGetLSN(stack->page);

				if (stack->blkno == GIST_ROOT_BLKNO)
				{
					/*
					 * the only page that can become inner instead of leaf
					 * is the root page, so for root we should recheck it
					 */
					if (!GistPageIsLeaf(stack->page))
					{
						/*
						 * very rare situation: during unlock/lock index with
						 * number of pages = 1 was increased
						 */
						LockBuffer(stack->buffer, GIST_UNLOCK);
						xlocked = false;
						continue;
					}

					/*
					 * we don't need to check root split, because checking
					 * leaf/inner is enough to recognize split for root
					 */
				}
				else if (GistFollowRight(stack->page) ||
						 XLByteLT(stack->parent->lsn,
								  GistPageGetOpaque(stack->page)->nsn))
				{
					/*
					 * The page was split while we momentarily unlocked the
					 * page. Go back to parent.
					 */
					UnlockReleaseBuffer(stack->buffer);
					xlocked = false;
					state.stack = stack = stack->parent;
					continue;
				}
			}

			/* now state.stack->(page, buffer and blkno) points to leaf page */

			gistinserttuples(&state, stack, giststate, &itup, 1,
							 InvalidOffsetNumber, InvalidBuffer);
			LockBuffer(stack->buffer, GIST_UNLOCK);

			/* Release any pins we might still hold before exiting */
			for (; stack; stack = stack->parent)
				ReleaseBuffer(stack->buffer);
			break;
		}
	}
}
Esempio n. 21
0
static void
do_failover(void)
{
	PGresult   *res;
	char		sqlquery[QUERY_STR_LEN];

	int			total_nodes = 0;
	int			visible_nodes = 0;
	int			ready_nodes = 0;

	bool		find_best = false;

	int			i;
	int			r;

	uint32		uxlogid;
	uint32		uxrecoff;
	XLogRecPtr	xlog_recptr;

	char		last_wal_standby_applied[MAXLEN];

	PGconn	   *node_conn = NULL;

	/*
	 * will get info about until 50 nodes, which seems to be large enough for
	 * most scenarios
	 */
	t_node_info nodes[50];

	/* initialize to keep compiler quiet */
	t_node_info best_candidate = {-1, "", InvalidXLogRecPtr, false, false, false};

	/* get a list of standby nodes, including myself */
	sprintf(sqlquery, "SELECT id, conninfo, witness "
			"  FROM %s.repl_nodes "
			" WHERE cluster = '%s' "
			" ORDER BY priority, id ",
			repmgr_schema, local_options.cluster_name);

	res = PQexec(my_local_conn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		log_err(_("Can't get nodes' info: %s\n"), PQerrorMessage(my_local_conn));
		PQclear(res);
		terminate(ERR_DB_QUERY);
	}

	/*
	 * total nodes that are registered
	 */
	total_nodes = PQntuples(res);
	log_debug(_("%s: there are %d nodes registered\n"), progname, total_nodes);

	/*
	 * Build an array with the nodes and indicate which ones are visible and
	 * ready
	 */
	for (i = 0; i < total_nodes; i++)
	{
		nodes[i].node_id = atoi(PQgetvalue(res, i, 0));
		strncpy(nodes[i].conninfo_str, PQgetvalue(res, i, 1), MAXLEN);
		nodes[i].is_witness = (strcmp(PQgetvalue(res, i, 2), "t") == 0) ? true : false;

		/*
		 * Initialize on false so if we can't reach this node we know that
		 * later
		 */
		nodes[i].is_visible = false;
		nodes[i].is_ready = false;

		XLAssignValue(nodes[i].xlog_location, 0, 0);

		log_debug(_("%s: node=%d conninfo=\"%s\" witness=%s\n"),
				  progname, nodes[i].node_id, nodes[i].conninfo_str,
				  (nodes[i].is_witness) ? "true" : "false");

		node_conn = establish_db_connection(nodes[i].conninfo_str, false);

		/* if we can't see the node just skip it */
		if (PQstatus(node_conn) != CONNECTION_OK)
		{
			if (node_conn != NULL)
				PQfinish(node_conn);

			continue;
		}

		visible_nodes++;
		nodes[i].is_visible = true;

		PQfinish(node_conn);
	}
	PQclear(res);

	log_debug(_("Total nodes counted: registered=%d, visible=%d\n"),
			  total_nodes, visible_nodes);

	/*
	 * am i on the group that should keep alive? if i see less than half of
	 * total_nodes then i should do nothing
	 */
	if (visible_nodes < (total_nodes / 2.0))
	{
		log_err(_("Can't reach most of the nodes.\n"
				  "Let the other standby servers decide which one will be the primary.\n"
		"Manual action will be needed to readd this node to the cluster.\n"));
		terminate(ERR_FAILOVER_FAIL);
	}

	/* Query all the nodes to determine which ones are ready */
	for (i = 0; i < total_nodes; i++)
	{
		/* if the node is not visible, skip it */
		if (!nodes[i].is_visible)
			continue;

		if (nodes[i].is_witness)
			continue;

		node_conn = establish_db_connection(nodes[i].conninfo_str, false);

		/*
		 * XXX This shouldn't happen, if this happens it means this is a major
		 * problem maybe network outages? anyway, is better for a human to
		 * react
		 */
		if (PQstatus(node_conn) != CONNECTION_OK)
		{
			log_err(_("It seems new problems are arising, manual intervention is needed\n"));
			terminate(ERR_FAILOVER_FAIL);
		}

		uxlogid = 0;
		uxrecoff = 0;

		sqlquery_snprintf(sqlquery, "SELECT pg_last_xlog_receive_location()");
		res = PQexec(node_conn, sqlquery);
		if (PQresultStatus(res) != PGRES_TUPLES_OK)
		{
			log_info(_("Can't get node's last standby location: %s\n"),
					 PQerrorMessage(node_conn));
			log_info(_("Connection details: %s\n"), nodes[i].conninfo_str);
			PQclear(res);
			PQfinish(node_conn);
			terminate(ERR_FAILOVER_FAIL);
		}

		if (sscanf(PQgetvalue(res, 0, 0), "%X/%X", &uxlogid, &uxrecoff) != 2)
			log_info(_("could not parse transaction log location \"%s\"\n"),
					 PQgetvalue(res, 0, 0));

		log_debug("XLog position of node %d: log id=%u (%X), offset=%u (%X)\n",
				  nodes[i].node_id, uxlogid, uxlogid, uxrecoff, uxrecoff);

		/* If position is 0/0, error */
		if (uxlogid == 0 && uxrecoff == 0)
		{
			PQclear(res);
			PQfinish(node_conn);
			log_info(_("InvalidXLogRecPtr detected in a standby\n"));
			terminate(ERR_FAILOVER_FAIL);
		}

		XLAssignValue(nodes[i].xlog_location, uxlogid, uxrecoff);

		PQclear(res);
		PQfinish(node_conn);
	}

	/* last we get info about this node, and update shared memory */
	sprintf(sqlquery, "SELECT pg_last_xlog_receive_location()");
	res = PQexec(my_local_conn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		log_err(_("PQexec failed: %s.\nReport an invalid value to not be "
				  " considered as new primary and exit.\n"),
				PQerrorMessage(my_local_conn));
		PQclear(res);
		sprintf(last_wal_standby_applied, "'%X/%X'", 0, 0);
		update_shared_memory(last_wal_standby_applied);
		terminate(ERR_DB_QUERY);
	}

	/* write last location in shared memory */
	update_shared_memory(PQgetvalue(res, 0, 0));
	PQclear(res);

	for (i = 0; i < total_nodes; i++)
	{
		while (!nodes[i].is_ready)
		{
			/*
			 * the witness will always be masked as ready if it's still not
			 * marked that way and avoid a useless query
			 */
			if (nodes[i].is_witness)
			{
				if (!nodes[i].is_ready)
				{
					nodes[i].is_ready = true;
					ready_nodes++;
				}
				break;
			}

			/* if the node is not visible, skip it */
			if (!nodes[i].is_visible)
				break;

			/* if the node is ready there is nothing to check, skip it too */
			if (nodes[i].is_ready)
				break;

			node_conn = establish_db_connection(nodes[i].conninfo_str, false);

			/*
			 * XXX This shouldn't happen, if this happens it means this is a
			 * major problem maybe network outages? anyway, is better for a
			 * human to react
			 */
			if (PQstatus(node_conn) != CONNECTION_OK)
			{
				/* XXX */
				log_info(_("At this point, it could be some race conditions "
						"that are acceptable, assume the node is restarting "
						   "and starting failover procedure\n"));
				break;
			}

			uxlogid = 0;
			uxrecoff = 0;

			sqlquery_snprintf(sqlquery, "SELECT %s.repmgr_get_last_standby_location()",
							  repmgr_schema);
			res = PQexec(node_conn, sqlquery);
			if (PQresultStatus(res) != PGRES_TUPLES_OK)
			{
				log_err(_("PQexec failed: %s.\nReport an invalid value to not"
						  "be considered as new primary and exit.\n"),
						PQerrorMessage(node_conn));
				PQclear(res);
				PQfinish(node_conn);
				terminate(ERR_DB_QUERY);
			}

			if (sscanf(PQgetvalue(res, 0, 0), "%X/%X", &uxlogid, &uxrecoff) != 2)
			{
				log_info(_("could not parse transaction log location \"%s\"\n"),
						 PQgetvalue(res, 0, 0));

				/* we can't do anything but fail at this point... */
				if (*PQgetvalue(res, 0, 0) == '\0')
				{
					log_crit("Whoops, seems as if shared_preload_libraries=repmgr_funcs is not set!\n");
					exit(ERR_BAD_CONFIG);
				}
			}


			PQclear(res);
			PQfinish(node_conn);
			/* If position is 0/0, keep checking */
			if (uxlogid == 0 && uxrecoff == 0)
				continue;

			XLAssignValue(xlog_recptr, uxlogid, uxrecoff);

			if (XLByteLT(nodes[i].xlog_location, xlog_recptr))
			{
				XLAssignValue(nodes[i].xlog_location, uxlogid, uxrecoff);
			}

			log_debug("Last XLog position of node %d: log id=%u (%X), offset=%u (%X)\n",
					  nodes[i].node_id, uxlogid, uxlogid,
					  uxrecoff, uxrecoff);

			ready_nodes++;
			nodes[i].is_ready = true;
		}
	}

	/* Close the connection to this server */
	PQfinish(my_local_conn);
	my_local_conn = NULL;

	/*
	 * determine which one is the best candidate to promote to primary
	 */
	for (i = 0; i < total_nodes; i++)
	{
		/* witness is never a good candidate */
		if (nodes[i].is_witness)
			continue;

		if (!nodes[i].is_ready || !nodes[i].is_visible)
			continue;

		if (!find_best)
		{
			/*
			 * start with the first ready node, and then move on to the next
			 * one
			 */
			best_candidate.node_id = nodes[i].node_id;
			XLAssign(best_candidate.xlog_location, nodes[i].xlog_location);
			best_candidate.is_ready = nodes[i].is_ready;
			best_candidate.is_witness = nodes[i].is_witness;
			find_best = true;
		}

		/* we use the macros provided by xlogdefs.h to compare XLogRecPtr */

		/*
		 * Nodes are retrieved ordered by priority, so if the current best
		 * candidate is lower than the next node's wal location then assign
		 * next node as the new best candidate.
		 */
		if (XLByteLT(best_candidate.xlog_location, nodes[i].xlog_location))
		{
			best_candidate.node_id = nodes[i].node_id;
			XLAssign(best_candidate.xlog_location, nodes[i].xlog_location);
			best_candidate.is_ready = nodes[i].is_ready;
			best_candidate.is_witness = nodes[i].is_witness;
		}
	}

	/* once we know who is the best candidate, promote it */
	if (find_best && (best_candidate.node_id == local_options.node))
	{
		if (best_candidate.is_witness)
		{
			log_err(_("%s: Node selected as new master is a witness. Can't be promoted.\n"),
					progname);
			terminate(ERR_FAILOVER_FAIL);
		}

		/* wait */
		sleep(5);

		if (verbose)
			log_info(_("%s: This node is the best candidate to be the new primary, promoting...\n"),
					 progname);
		log_debug(_("promote command is: \"%s\"\n"),
				  local_options.promote_command);

		if (log_type == REPMGR_STDERR && *local_options.logfile)
		{
			fflush(stderr);
		}

		r = system(local_options.promote_command);
		if (r != 0)
		{
			log_err(_("%s: promote command failed. You could check and try it manually.\n"),
					progname);
			terminate(ERR_BAD_CONFIG);
		}
	}
	else if (find_best)
	{
		/* wait */
		sleep(10);

		if (verbose)
			log_info(_("%s: Node %d is the best candidate to be the new primary, we should follow it...\n"),
					 progname, best_candidate.node_id);
		log_debug(_("follow command is: \"%s\"\n"), local_options.follow_command);

		/*
		 * New Primary need some time to be promoted. The follow command
		 * should take care of that.
		 */
		if (log_type == REPMGR_STDERR && *local_options.logfile)
		{
			fflush(stderr);
		}

		r = system(local_options.follow_command);
		if (r != 0)
		{
			log_err(_("%s: follow command failed. You could check and try it manually.\n"),
					progname);
			terminate(ERR_BAD_CONFIG);
		}
	}
	else
	{
		log_err(_("%s: Did not find candidates. You should check and try manually.\n"),
				progname);
		terminate(ERR_FAILOVER_FAIL);
	}

	/* to force it to re-calculate mode and master node */
	failover_done = true;

	/* and reconnect to the local database */
	my_local_conn = establish_db_connection(local_options.conninfo, true);
}
Esempio n. 22
0
static void
do_failover(void)
{
	PGresult *res1;
	PGresult *res2;
	char 	sqlquery[8192];

	int		total_nodes = 0;
	int		visible_nodes = 0;
	bool	find_best = false;
	bool	witness = false;

	int		i;
	int		r;

	int 	node;
	char	nodeConninfo[MAXLEN];

	unsigned int uxlogid;
	unsigned int uxrecoff;
	char last_wal_standby_applied[MAXLEN];

	PGconn	*nodeConn = NULL;

	/*
	 * will get info about until 50 nodes,
	 * which seems to be large enough for most scenarios
	 */
	nodeInfo nodes[50];
	/* initialize to keep compiler quiet */
	nodeInfo best_candidate = {-1, InvalidXLogRecPtr, false, false};

	/* first we get info about this node, and update shared memory */
	sprintf(sqlquery, "SELECT pg_last_xlog_receive_location()");
	res1 = PQexec(myLocalConn, sqlquery);
	if (PQresultStatus(res1) != PGRES_TUPLES_OK)
	{
		log_err(_("PQexec failed: %s.\nReport an invalid value to not be considered as new primary and exit.\n"), PQerrorMessage(myLocalConn));
		PQclear(res1);
		sprintf(last_wal_standby_applied, "'%X/%X'", 0, 0);
		update_shared_memory(last_wal_standby_applied);
		exit(ERR_DB_QUERY);
	}

	/* write last location in shared memory */
	update_shared_memory(PQgetvalue(res1, 0, 0));

	/*
	 * we sleep the monitor time + one second
	 * we bet it should be enough for other repmgrd to update their own data
	 */
	sleep(SLEEP_MONITOR + 1);

	/* get a list of standby nodes, including myself */
	sprintf(sqlquery, "SELECT id, conninfo, witness "
	        "  FROM %s.repl_nodes "
	        " WHERE id <> %d "
	        "   AND cluster = '%s' "
	        " ORDER BY priority ",
	        repmgr_schema, primary_options.node, local_options.cluster_name);

	res1 = PQexec(myLocalConn, sqlquery);
	if (PQresultStatus(res1) != PGRES_TUPLES_OK)
	{
		log_err(_("Can't get nodes info: %s\n"), PQerrorMessage(myLocalConn));
		PQclear(res1);
		PQfinish(myLocalConn);
		exit(ERR_DB_QUERY);
	}

	log_debug(_("%s: there are %d nodes registered"), progname, PQntuples(res1));
	/* ask for the locations */
	for (i = 0; i < PQntuples(res1); i++)
	{
		node = atoi(PQgetvalue(res1, i, 0));
		/* Initialize on false so if we can't reach this node we know that later */
		nodes[i].is_ready = false;
		strncpy(nodeConninfo, PQgetvalue(res1, i, 1), MAXLEN);
		witness = (strcmp(PQgetvalue(res1, i, 2), "t") == 0) ? true : false;

		log_debug(_("%s: node=%d conninfo=\"%s\" witness=%s"), progname, node, nodeConninfo, (witness) ? "true" : "false");

		nodeConn = establishDBConnection(nodeConninfo, false);
		/* if we can't see the node just skip it */
		if (PQstatus(nodeConn) != CONNECTION_OK)
			continue;

		/* the witness will always show 0/0 so avoid a useless query */
		if (!witness)
		{
			sqlquery_snprintf(sqlquery, "SELECT %s.repmgr_get_last_standby_location()", repmgr_schema);
			res2 = PQexec(nodeConn, sqlquery);
			if (PQresultStatus(res2) != PGRES_TUPLES_OK)
			{
				log_info(_("Can't get node's last standby location: %s\n"), PQerrorMessage(nodeConn));
				log_info(_("Connection details: %s\n"), nodeConninfo);
				PQclear(res2);
				PQfinish(nodeConn);
				continue;
			}

			if (sscanf(PQgetvalue(res2, 0, 0), "%X/%X", &uxlogid, &uxrecoff) != 2)
				log_info(_("could not parse transaction log location \"%s\"\n"), PQgetvalue(res2, 0, 0));

			PQclear(res2);
		}
		else
		{
			uxlogid = 0;
			uxrecoff = 0;
		}

		visible_nodes++;

		nodes[i].nodeId = node;
		nodes[i].xlog_location.xlogid = uxlogid;
		nodes[i].xlog_location.xrecoff = uxrecoff;
		nodes[i].is_ready = true;
		nodes[i].is_witness = witness;

		PQfinish(nodeConn);
	}
	PQclear(res1);
	/* Close the connection to this server */
	PQfinish(myLocalConn);

	/*
	 * total nodes that are registered, include master which is a node but was
	 * not counted because it's not a standby
	 */
	total_nodes = i + 1;

	/*
	 * am i on the group that should keep alive?
	 * if i see less than half of total_nodes then i should do nothing
	 */
	if (visible_nodes < (total_nodes / 2.0))
	{
		log_err(_("Can't reach most of the nodes.\n"
		          "Let the other standby servers decide which one will be the primary.\n"
		          "Manual action will be needed to readd this node to the cluster.\n"));
		exit(ERR_FAILOVER_FAIL);
	}

	/*
	 * determine which one is the best candidate to promote to primary
	 */
	for (i = 0; i < total_nodes - 1; i++)
	{
		/* witness is never a good candidate */
		if (nodes[i].is_witness)
			continue;

		if (!nodes[i].is_ready)
			continue;

		if (!find_best)
		{
			/* start with the first ready node, and then move on to the next one */
			best_candidate.nodeId                = nodes[i].nodeId;
			best_candidate.xlog_location.xlogid  = nodes[i].xlog_location.xlogid;
			best_candidate.xlog_location.xrecoff = nodes[i].xlog_location.xrecoff;
			best_candidate.is_ready              = nodes[i].is_ready;
			best_candidate.is_witness            = nodes[i].is_witness;
			find_best = true;
		}

		/* we use the macros provided by xlogdefs.h to compare XLogRecPtr */
		/*
		 * Nodes are retrieved ordered by priority, so if the current
		 * best candidate is lower than the next node's wal location
		 * then assign next node as the new best candidate.
		 */
		if (XLByteLT(best_candidate.xlog_location, nodes[i].xlog_location))
		{
			best_candidate.nodeId                = nodes[i].nodeId;
			best_candidate.xlog_location.xlogid  = nodes[i].xlog_location.xlogid;
			best_candidate.xlog_location.xrecoff = nodes[i].xlog_location.xrecoff;
			best_candidate.is_ready              = nodes[i].is_ready;
			best_candidate.is_witness            = nodes[i].is_witness;
		}
	}

	/* once we know who is the best candidate, promote it */
	if (find_best && (best_candidate.nodeId == local_options.node))
	{
		if (best_candidate.is_witness)
		{
			log_err(_("%s: Node selected as new master is a witness. Can't be promoted.\n"), progname);
			exit(ERR_FAILOVER_FAIL);
		}

		if (verbose)
			log_info(_("%s: This node is the best candidate to be the new primary, promoting...\n"),
			         progname);
		log_debug(_("promote command is: \"%s\"\n"), local_options.promote_command);
		r = system(local_options.promote_command);
		if (r != 0)
		{
			log_err(_("%s: promote command failed. You could check and try it manually.\n"), progname);
			exit(ERR_BAD_CONFIG);
		}
	}
	else if (find_best)
	{
		if (verbose)
			log_info(_("%s: Node %d is the best candidate to be the new primary, we should follow it...\n"),
			         progname, best_candidate.nodeId);
		log_debug(_("follow command is: \"%s\"\n"), local_options.follow_command);
		/*
		 * New Primary need some time to be promoted.
		 * The follow command should take care of that.
		 */
		r = system(local_options.follow_command);
		if (r != 0)
		{
			log_err(_("%s: follow command failed. You could check and try it manually.\n"), progname);
			exit(ERR_BAD_CONFIG);
		}
	}
	else
	{
		log_err(_("%s: Did not find candidates. You should check and try manually.\n"), progname);
		exit(ERR_FAILOVER_FAIL);
	}

	/* and reconnect to the local database */
	myLocalConn = establishDBConnection(local_options.conninfo, true);
}
Esempio n. 23
0
static void
bitmap_xlog_insert_bitmap(bool redo, XLogRecPtr lsn, XLogRecord* record)
{
	xl_bm_bitmappage	*xlrec = (xl_bm_bitmappage*) XLogRecGetData(record);
	Relation reln;

	reln = XLogOpenRelation(xlrec->bm_node);
	if (!RelationIsValid(reln))
		return;

	if (redo)
	{
		Buffer	bitmapBuffer;
		Page	bitmapPage;
		BMBitmapOpaque	bitmapPageOpaque ;

		bitmapBuffer = XLogReadBuffer(false, reln, xlrec->bm_bitmap_blkno);
		if (!BufferIsValid(bitmapBuffer))
			elog(PANIC, "bm_insert_redo: block unfound: %d",
				 xlrec->bm_bitmap_blkno);

		bitmapPage = BufferGetPage(bitmapBuffer);

		if (XLByteLT(PageGetLSN(bitmapPage), lsn))
		{
			bitmapPageOpaque = (BMBitmapOpaque)PageGetSpecialPointer(bitmapPage);;

#ifdef BM_DEBUG
			ereport(LOG, (errcode(LOG), 
				errmsg("call bitmap_xlog_insert_bitmap: redo=%d, blkno=%d, isOpaque=%d, words_used=%d, lastword=%d, next_blkno=%d\n", redo, xlrec->bm_bitmap_blkno, xlrec->bm_isOpaque, xlrec->bm_lastword_pos, xlrec->bm_lastword_in_block, xlrec->bm_next_blkno)));
#endif

			if (xlrec->bm_isOpaque)
			{
				if (bitmapPageOpaque->bm_bitmap_next != InvalidBlockNumber)
					elog(PANIC, 
						"%s next bitmap page for blkno %d is already set",
						"bm_insert_redo: ",
						xlrec->bm_bitmap_blkno);
				Assert(bitmapPageOpaque->bm_hrl_words_used == 
						BM_NUM_OF_HRL_WORDS_PER_PAGE);

				bitmapPageOpaque->bm_bitmap_next = xlrec->bm_next_blkno;
			}

			else 
			{
				BMBitmap	bitmap;

				if (bitmapPageOpaque->bm_hrl_words_used != 
					xlrec->bm_lastword_pos - 1)
					elog(PANIC, 
						"bm_insert_redo: a bit has been inserted in the pos %d",
						xlrec->bm_lastword_pos);

				Assert (xlrec->bm_lastword_in_block != 0);

				bitmap = (BMBitmap) PageGetContents(bitmapPage);
				
				bitmap->bm_headerWords
					[(bitmapPageOpaque->bm_hrl_words_used/BM_HRL_WORD_SIZE)] |=
					(1<<(BM_HRL_WORD_SIZE-1-
					(bitmapPageOpaque->bm_hrl_words_used%BM_HRL_WORD_SIZE)));
				bitmap->bm_contentWords[bitmapPageOpaque->bm_hrl_words_used] = 
					xlrec->bm_lastword_in_block;
				bitmapPageOpaque->bm_hrl_words_used ++;
			}

			PageSetLSN(bitmapPage, lsn);
			PageSetTLI(bitmapPage, ThisTimeLineID);
			_bitmap_wrtbuf(bitmapBuffer);
		}
		else
			_bitmap_relbuf(bitmapBuffer);
	}

	else
		elog(PANIC, "bm_insert_undo: not implemented.");
}
Esempio n. 24
0
/*
 * Update the LSNs on each queue based upon our latest state. This
 * implements a simple policy of first-valid-standby-releases-waiter.
 *
 * Other policies are possible, which would change what we do here and what
 * perhaps also which information we store as well.
 */
void
SyncRepReleaseWaiters(void)
{
	volatile WalSndCtlData *walsndctl = WalSndCtl;
	volatile WalSnd *syncWalSnd = NULL;
	int 		numprocs = 0;
	int			priority = 0;
	int			i;

	/*
	 * If this WALSender is serving a standby that is not on the list of
	 * potential standbys then we have nothing to do. If we are still
	 * starting up or still running base backup, then leave quickly also.
	 */
	if (MyWalSnd->sync_standby_priority == 0 ||
		MyWalSnd->state < WALSNDSTATE_STREAMING)
		return;

	/*
	 * We're a potential sync standby. Release waiters if we are the
	 * highest priority standby. If there are multiple standbys with
	 * same priorities then we use the first mentioned standby.
	 * If you change this, also change pg_stat_get_wal_senders().
	 */
	LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);

	for (i = 0; i < max_wal_senders; i++)
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile WalSnd *walsnd = &walsndctl->walsnds[i];

		if (walsnd->pid != 0 &&
			walsnd->sync_standby_priority > 0 &&
			(priority == 0 ||
			 priority > walsnd->sync_standby_priority))
		{
			 priority = walsnd->sync_standby_priority;
			 syncWalSnd = walsnd;
		}
	}

	/*
	 * We should have found ourselves at least.
	 */
	Assert(syncWalSnd);

	/*
	 * If we aren't managing the highest priority standby then just leave.
	 */
	if (syncWalSnd != MyWalSnd)
	{
		LWLockRelease(SyncRepLock);
		announce_next_takeover = true;
		return;
	}

	if (XLByteLT(walsndctl->lsn, MyWalSnd->flush))
	{
		/*
		 * Set the lsn first so that when we wake backends they will
		 * release up to this location.
		 */
		walsndctl->lsn = MyWalSnd->flush;
		numprocs = SyncRepWakeQueue(false);
	}

	LWLockRelease(SyncRepLock);

	elog(DEBUG3, "released %d procs up to %X/%X",
					numprocs,
					MyWalSnd->flush.xlogid,
					MyWalSnd->flush.xrecoff);

	/*
	 * If we are managing the highest priority standby, though we weren't
	 * prior to this, then announce we are now the sync standby.
	 */
	if (announce_next_takeover)
	{
		announce_next_takeover = false;
		ereport(LOG,
				(errmsg("standby \"%s\" is now the synchronous standby with priority %u",
						application_name, MyWalSnd->sync_standby_priority)));
	}
}
Esempio n. 25
0
static void
gistfindleaf(GISTInsertState *state, GISTSTATE *giststate)
{
	ItemId		iid;
	IndexTuple	idxtuple;
	GISTPageOpaque opaque;

	MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD;

	/*
	 * walk down, We don't lock page for a long time, but so we should be
	 * ready to recheck path in a bad case... We remember, that page->lsn
	 * should never be invalid.
	 */
	for (;;)
	{
		if (XLogRecPtrIsInvalid(state->stack->lsn))
			state->stack->buffer = ReadBuffer(state->r, state->stack->blkno);
		LockBuffer(state->stack->buffer, GIST_SHARE);
		gistcheckpage(state->r, state->stack->buffer);

		state->stack->page = (Page) BufferGetPage(state->stack->buffer);
		opaque = GistPageGetOpaque(state->stack->page);

		state->stack->lsn = PageGetLSN(state->stack->page);
		Assert(state->r->rd_istemp || !XLogRecPtrIsInvalid(state->stack->lsn));

		if (state->stack->blkno != GIST_ROOT_BLKNO &&
			XLByteLT(state->stack->parent->lsn, opaque->nsn))
		{
			/*
			 * caused split non-root page is detected, go up to parent to
			 * choose best child
			 */
			UnlockReleaseBuffer(state->stack->buffer);
			state->stack = state->stack->parent;
			continue;
		}

		if (!GistPageIsLeaf(state->stack->page))
		{
			/*
			 * This is an internal page, so continue to walk down the tree. We
			 * find the child node that has the minimum insertion penalty and
			 * recursively invoke ourselves to modify that node. Once the
			 * recursive call returns, we may need to adjust the parent node
			 * for two reasons: the child node split, or the key in this node
			 * needs to be adjusted for the newly inserted key below us.
			 */
			GISTInsertStack *item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));

			state->stack->childoffnum = gistchoose(state->r, state->stack->page, state->itup[0], giststate);

			iid = PageGetItemId(state->stack->page, state->stack->childoffnum);
			idxtuple = (IndexTuple) PageGetItem(state->stack->page, iid);
			item->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
			LockBuffer(state->stack->buffer, GIST_UNLOCK);

			item->parent = state->stack;
			item->child = NULL;
			if (state->stack)
				state->stack->child = item;
			state->stack = item;
		}
		else
		{
			/* be carefull, during unlock/lock page may be changed... */
			LockBuffer(state->stack->buffer, GIST_UNLOCK);
			LockBuffer(state->stack->buffer, GIST_EXCLUSIVE);
			state->stack->page = (Page) BufferGetPage(state->stack->buffer);
			opaque = GistPageGetOpaque(state->stack->page);

			if (state->stack->blkno == GIST_ROOT_BLKNO)
			{
				/*
				 * the only page can become inner instead of leaf is a root
				 * page, so for root we should recheck it
				 */
				if (!GistPageIsLeaf(state->stack->page))
				{
					/*
					 * very rarely situation: during unlock/lock index with
					 * number of pages = 1 was increased
					 */
					LockBuffer(state->stack->buffer, GIST_UNLOCK);
					continue;
				}

				/*
				 * we don't need to check root split, because checking
				 * leaf/inner is enough to recognize split for root
				 */

			}
			else if (XLByteLT(state->stack->parent->lsn, opaque->nsn))
			{
				/*
				 * detecting split during unlock/lock, so we should find
				 * better child on parent
				 */

				/* forget buffer */
				UnlockReleaseBuffer(state->stack->buffer);

				state->stack = state->stack->parent;
				continue;
			}

			state->stack->lsn = PageGetLSN(state->stack->page);

			/* ok we found a leaf page and it X-locked */
			break;
		}
	}

	/* now state->stack->(page, buffer and blkno) points to leaf page */
}
Esempio n. 26
0
File: gistget.c Progetto: 50wu/gpdb
/*
 * Fetch a tuples that matchs the search key; this can be invoked
 * either to fetch the first such tuple or subsequent matching
 * tuples. Returns true iff a matching tuple was found.
 */
static int
gistnext(IndexScanDesc scan, ScanDirection dir, ItemPointer tids,
		 int maxtids, bool ignore_killed_tuples)
{
	MIRROREDLOCK_BUFMGR_DECLARE;

	Page		p;
	OffsetNumber n;
	GISTScanOpaque so;
	GISTSearchStack *stk;
	IndexTuple	it;
	GISTPageOpaque opaque;
	int			ntids = 0;

	so = (GISTScanOpaque) scan->opaque;

	// -------- MirroredLock ----------
	MIRROREDLOCK_BUFMGR_LOCK;

	if ( so->qual_ok == false )
		return 0;

	if (ItemPointerIsValid(&so->curpos) == false)
	{
		/* Being asked to fetch the first entry, so start at the root */
		Assert(so->curbuf == InvalidBuffer);
		Assert(so->stack == NULL);

		so->curbuf = ReadBuffer(scan->indexRelation, GIST_ROOT_BLKNO);

		stk = so->stack = (GISTSearchStack *) palloc0(sizeof(GISTSearchStack));

		stk->next = NULL;
		stk->block = GIST_ROOT_BLKNO;

		pgstat_count_index_scan(scan->indexRelation);
	}
	else if (so->curbuf == InvalidBuffer)
	{
		MIRROREDLOCK_BUFMGR_UNLOCK;
		// -------- MirroredLock ----------

		return 0;
	}

	/*
	 * check stored pointers from last visit 
	 */
	if ( so->nPageData > 0 ) 
	{
		while( ntids < maxtids && so->curPageData < so->nPageData )
		{
			tids[ ntids ] = scan->xs_ctup.t_self = so->pageData[ so->curPageData ].heapPtr;
			ItemPointerSet(&(so->curpos),
							   BufferGetBlockNumber(so->curbuf), 
							   so->pageData[ so->curPageData ].pageOffset);

				
			so->curPageData ++;
			ntids++;
		}

		if ( ntids == maxtids )
		{
			MIRROREDLOCK_BUFMGR_UNLOCK;
			// -------- MirroredLock ----------

			return ntids;
		}
		
		/*
		 * Go to the next page
		 */
		stk = so->stack->next;
		pfree(so->stack);
		so->stack = stk;

		/* If we're out of stack entries, we're done */
		if (so->stack == NULL)
		{
			ReleaseBuffer(so->curbuf);
			so->curbuf = InvalidBuffer;

			MIRROREDLOCK_BUFMGR_UNLOCK;
			// -------- MirroredLock ----------

			return ntids;
		}

		so->curbuf = ReleaseAndReadBuffer(so->curbuf,
										  scan->indexRelation,
										  stk->block);
	}

	for (;;)
	{
		/* First of all, we need lock buffer */
		Assert(so->curbuf != InvalidBuffer);
		LockBuffer(so->curbuf, GIST_SHARE);
		gistcheckpage(scan->indexRelation, so->curbuf);
		p = BufferGetPage(so->curbuf);
		opaque = GistPageGetOpaque(p);

		/* remember lsn to identify page changed for tuple's killing */
		so->stack->lsn = PageGetLSN(p);

		/* check page split, occured from last visit or visit to parent */
		if (!XLogRecPtrIsInvalid(so->stack->parentlsn) &&
			XLByteLT(so->stack->parentlsn, opaque->nsn) &&
			opaque->rightlink != InvalidBlockNumber /* sanity check */ &&
			(so->stack->next == NULL || so->stack->next->block != opaque->rightlink)		/* check if already
					added */ )
		{
			/* detect page split, follow right link to add pages */

			stk = (GISTSearchStack *) palloc(sizeof(GISTSearchStack));
			stk->next = so->stack->next;
			stk->block = opaque->rightlink;
			stk->parentlsn = so->stack->parentlsn;
			memset(&(stk->lsn), 0, sizeof(GistNSN));
			so->stack->next = stk;
		}

		/* if page is empty, then just skip it */
		if (PageIsEmpty(p))
		{
			LockBuffer(so->curbuf, GIST_UNLOCK);
			stk = so->stack->next;
			pfree(so->stack);
			so->stack = stk;

			if (so->stack == NULL)
			{
				ReleaseBuffer(so->curbuf);
				so->curbuf = InvalidBuffer;

				MIRROREDLOCK_BUFMGR_UNLOCK;
				// -------- MirroredLock ----------

				return ntids;
			}

			so->curbuf = ReleaseAndReadBuffer(so->curbuf, scan->indexRelation,
											  stk->block);
			continue;
		}

		if (ScanDirectionIsBackward(dir))
			n = PageGetMaxOffsetNumber(p);
		else
			n = FirstOffsetNumber;

		/* wonderful, we can look at page */
		so->nPageData = so->curPageData = 0;

		for (;;)
		{
			n = gistfindnext(scan, n, dir);

			if (!OffsetNumberIsValid(n))
			{
				while( ntids < maxtids && so->curPageData < so->nPageData )
				{
					tids[ ntids ] = scan->xs_ctup.t_self = 
						so->pageData[ so->curPageData ].heapPtr;
				
					ItemPointerSet(&(so->curpos),
								   BufferGetBlockNumber(so->curbuf), 
								   so->pageData[ so->curPageData ].pageOffset);

					so->curPageData ++;
					ntids++;
				}

				if ( ntids == maxtids )
				{
					LockBuffer(so->curbuf, GIST_UNLOCK);
					
					MIRROREDLOCK_BUFMGR_UNLOCK;
					// -------- MirroredLock ----------
					
					return ntids;
				}

				/*
				 * We ran out of matching index entries on the current page,
				 * so pop the top stack entry and use it to continue the
				 * search.
				 */
				LockBuffer(so->curbuf, GIST_UNLOCK);
				stk = so->stack->next;
				pfree(so->stack);
				so->stack = stk;

				/* If we're out of stack entries, we're done */

				if (so->stack == NULL)
				{
					ReleaseBuffer(so->curbuf);
					so->curbuf = InvalidBuffer;
					
					MIRROREDLOCK_BUFMGR_UNLOCK;
					// -------- MirroredLock ----------
					
					return ntids;
				}

				so->curbuf = ReleaseAndReadBuffer(so->curbuf,
												  scan->indexRelation,
												  stk->block);
				/* XXX	go up */
				break;
			}

			if (GistPageIsLeaf(p))
			{
				/*
				 * We've found a matching index entry in a leaf page, so
				 * return success. Note that we keep "curbuf" pinned so that
				 * we can efficiently resume the index scan later.
				 */

				if (!(ignore_killed_tuples && ItemIdIsDead(PageGetItemId(p, n))))
				{
					it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
					so->pageData[ so->nPageData ].heapPtr = it->t_tid;
					so->pageData[ so->nPageData ].pageOffset = n;
					so->nPageData ++;
				}
			}
			else
			{
				/*
				 * We've found an entry in an internal node whose key is
				 * consistent with the search key, so push it to stack
				 */

				stk = (GISTSearchStack *) palloc(sizeof(GISTSearchStack));

				it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
				stk->block = ItemPointerGetBlockNumber(&(it->t_tid));
				memset(&(stk->lsn), 0, sizeof(GistNSN));
				stk->parentlsn = so->stack->lsn;

				stk->next = so->stack->next;
				so->stack->next = stk;

			}

			if (ScanDirectionIsBackward(dir))
				n = OffsetNumberPrev(n);
			else
				n = OffsetNumberNext(n);
		}
	}

	MIRROREDLOCK_BUFMGR_UNLOCK;
	// -------- MirroredLock ----------

	return ntids;
}
Esempio n. 27
0
static void
bitmap_xlog_newpage(bool redo, XLogRecPtr lsn, XLogRecord *record)
{
	xl_bm_newpage	*xlrec = (xl_bm_newpage*) XLogRecGetData(record);

	Relation		reln;
	Page			page;
	uint8			info;

	/* xl_bm_metapage	*xlrecMeta = (xl_bm_metapage*)
		((char*)xlrec+sizeof(xl_bm_newpage)); */

	info = record->xl_info & ~XLR_INFO_MASK;

  ereport(DEBUG1, (errmsg_internal("into --> XLogOpenRelation")));
	reln = XLogOpenRelation(xlrec->bm_node);
  ereport(DEBUG1, (errmsg_internal("done --> XLogOpenRelation")));
	if (!RelationIsValid(reln))
		return;
  ereport(DEBUG1, (errmsg_internal("crash1")));

	if (redo)
	{
		Buffer		buffer;

#ifdef BM_DEBUG
		ereport(LOG, (errcode(LOG), 
			errmsg("call bitmap_xlog_newpage: redo=%d, info=%x\n", redo, info)));
#endif

		buffer = XLogReadBuffer(true, reln, xlrec->bm_new_blkno);
		if (!BufferIsValid(buffer))
			elog(PANIC, "bm_insert_redo: block unfound: %d", 
				 xlrec->bm_new_blkno);

		page = BufferGetPage(buffer);

		if (XLByteLT(PageGetLSN(page), lsn))
		{
			Buffer		metabuf;
			BMMetaPage	metapage;

			switch (info)
			{
				case XLOG_BITMAP_INSERT_NEWLOV:
					_bitmap_lovpageinit(reln, buffer);
					break;
				case XLOG_BITMAP_INSERT_NEWLOVMETA:
					_bitmap_lovmetapageinit(reln, buffer);
					break;
				case XLOG_BITMAP_INSERT_NEWBITMAP:
					_bitmap_bitmappageinit(reln, buffer);
					break;
				default:
					elog(PANIC, "bitmap_redo: unknown newpage op code %u", info);
			}

			PageSetLSN(page, lsn);
			PageSetTLI(page, ThisTimeLineID);
			_bitmap_wrtbuf(buffer);

			metabuf = XLogReadBuffer(true, reln, BM_METAPAGE);
			if (!BufferIsValid(metabuf))
				elog(PANIC, "bm_insert_redo: block unfound: %d", BM_METAPAGE);
			metapage = (BMMetaPage)BufferGetPage(metabuf);

			if (XLByteLT(PageGetLSN(metapage), lsn))
			{
				PageSetLSN(metapage, lsn);
				PageSetTLI(metapage, ThisTimeLineID);
				_bitmap_wrtbuf(metabuf);
			}
			else
				_bitmap_relbuf(metabuf);
		}

		else {
			_bitmap_relbuf(buffer);
		}
	}
	else
		elog(PANIC, "bm_insert_undo: not implemented.");
  /* elog(PANIC, "call completely done for _bitmap_lovmetapageinit from  bitmap_xlog_newpage[src/backend/access/bitmap/bitmapxlog.c]", info); */
}