Example #1
File: ist.c Project: fdr/pg-python
/*
 * pl_ist_reset - Abort the given number of ISTs and set a Python error.
 *
 * Used to handle cases where a code object fails to resolve open transactions.
 */
void
pl_ist_reset(unsigned long count)
{
	HOLD_INTERRUPTS();
	for (; count > 0; --count)
		RollbackAndReleaseCurrentSubTransaction();
	RESUME_INTERRUPTS();
}
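A minimal caller sketch for the routine above. The function and counter here are illustrative only, not taken from the pg-python source; they assume the caller tracks how many internal subtransactions its code object left open.

/* Hypothetical caller: roll back whatever ISTs the code object left open. */
static void
example_ist_cleanup(unsigned long ist_open_count)
{
	if (ist_open_count > 0)
	{
		/* aborts the open ISTs and sets the Python error for us */
		pl_ist_reset(ist_open_count);
	}
}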
Example #2
/*
 * Cancel any pending wait for lock, when aborting a transaction, and revert
 * any strong lock count acquisition for a lock being acquired.
 *
 * (Normally, this would only happen if we accept a cancel/die
 * interrupt while waiting; but an ereport(ERROR) before or during the lock
 * wait is within the realm of possibility, too.)
 */
void
LockErrorCleanup(void)
{
	LWLock	   *partitionLock;
	DisableTimeoutParams timeouts[2];

	HOLD_INTERRUPTS();

	AbortStrongLockAcquire();

	/* Nothing to do if we weren't waiting for a lock */
	if (lockAwaited == NULL)
	{
		RESUME_INTERRUPTS();
		return;
	}

	/*
	 * Turn off the deadlock and lock timeout timers, if they are still
	 * running (see ProcSleep).  Note we must preserve the LOCK_TIMEOUT
	 * indicator flag, since this function is executed before
	 * ProcessInterrupts when responding to SIGINT; else we'd lose the
	 * knowledge that the SIGINT came from a lock timeout and not an external
	 * source.
	 */
	timeouts[0].id = DEADLOCK_TIMEOUT;
	timeouts[0].keep_indicator = false;
	timeouts[1].id = LOCK_TIMEOUT;
	timeouts[1].keep_indicator = true;
	disable_timeouts(timeouts, 2);

	/* Unlink myself from the wait queue, if on it (might not be anymore!) */
	partitionLock = LockHashPartitionLock(lockAwaited->hashcode);
	LWLockAcquire(partitionLock, LW_EXCLUSIVE);

	if (MyProc->links.next != NULL)
	{
		/* We could not have been granted the lock yet */
		RemoveFromWaitQueue(MyProc, lockAwaited->hashcode);
	}
	else
	{
		/*
		 * Somebody kicked us off the lock queue already.  Perhaps they
		 * granted us the lock, or perhaps they detected a deadlock. If they
		 * did grant us the lock, we'd better remember it in our local lock
		 * table.
		 */
		if (MyProc->waitStatus == STATUS_OK)
			GrantAwaitedLock();
	}

	lockAwaited = NULL;

	LWLockRelease(partitionLock);

	RESUME_INTERRUPTS();
}
Example #3
/*
 * LWLockConditionalAcquire - acquire a lightweight lock in the specified mode
 *
 * If the lock is not available, return FALSE with no side-effects.
 *
 * If successful, cancel/die interrupts are held off until lock release.
 */
bool
LWLockConditionalAcquire(LWLockId lockid, LWLockMode mode)
{
	volatile LWLock *lock = LWLockArray + lockid;
	bool		mustwait;

	PRINT_LWDEBUG("LWLockConditionalAcquire", lockid, lock);

	/*
	 * Lock out cancel/die interrupts until we exit the code section
	 * protected by the LWLock.  This ensures that interrupts will not
	 * interfere with manipulations of data structures in shared memory.
	 */
	HOLD_INTERRUPTS();

	/* Acquire mutex.  Time spent holding mutex should be short! */
	SpinLockAcquire_NoHoldoff(&lock->mutex);

	/* If I can get the lock, do so quickly. */
	if (mode == LW_EXCLUSIVE)
	{
		if (lock->exclusive == 0 && lock->shared == 0)
		{
			lock->exclusive++;
			mustwait = false;
		}
		else
			mustwait = true;
	}
	else
	{
		if (lock->exclusive == 0)
		{
			lock->shared++;
			mustwait = false;
		}
		else
			mustwait = true;
	}

	/* We are done updating shared state of the lock itself. */
	SpinLockRelease_NoHoldoff(&lock->mutex);

	if (mustwait)
	{
		/* Failed to get lock, so release interrupt holdoff */
		RESUME_INTERRUPTS();
		LOG_LWDEBUG("LWLockConditionalAcquire", lockid, "failed");
	}
	else
	{
		/* Add lock to list of locks held by this backend */
		Assert(num_held_lwlocks < MAX_SIMUL_LWLOCKS);
		held_lwlocks[num_held_lwlocks++] = lockid;
	}

	return !mustwait;
}
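A hedged usage sketch for the API above: the conditional form lets a caller skip optional work instead of blocking. The lock id and the work inside the critical section are placeholders, not taken from any particular caller.

/* Illustrative caller: do optional maintenance only if the lock is free. */
static void
example_try_maintenance(LWLockId lockid)
{
	if (LWLockConditionalAcquire(lockid, LW_EXCLUSIVE))
	{
		/* ... update the shared structure protected by lockid ... */

		/* releasing the lock also drops the interrupt holdoff */
		LWLockRelease(lockid);
	}
	else
	{
		/* lock busy: skip this round rather than wait for it */
	}
}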
Example #4
/*
 * LWLockReleaseAll - release all currently-held locks
 *
 * Used to clean up after ereport(ERROR). An important difference between this
 * function and retail LWLockRelease calls is that InterruptHoldoffCount is
 * unchanged by this operation.  This is necessary since InterruptHoldoffCount
 * has been set to an appropriate level earlier in error recovery. We could
 * decrement it below zero if we allow it to drop for each released lock!
 */
void
LWLockReleaseAll(void)
{
	while (num_held_lwlocks > 0)
	{
		HOLD_INTERRUPTS();		/* match the upcoming RESUME_INTERRUPTS */

		LWLockRelease(held_lwlocks[num_held_lwlocks - 1]);
	}
}
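To see why the extra HOLD_INTERRUPTS() per iteration is needed, recall that each LWLockRelease() ends by dropping one level of interrupt holdoff. A condensed sketch of the release path (not the full function):

void
LWLockRelease(LWLockId lockid)
{
	/* ... remove lockid from held_lwlocks[] and wake up any waiters ... */

	/*
	 * Now okay to allow cancel/die interrupts.  This RESUME_INTERRUPTS is
	 * what each HOLD_INTERRUPTS in LWLockReleaseAll is matched against.
	 */
	RESUME_INTERRUPTS();
}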
Example #5
/*
 * Report a COMMERROR.
 *
 * This function holds off interrupts before reporting the error, to avoid
 * a self-deadlock situation; see MPP-13718 for more info.
 */
static void
report_commerror(const char *err_msg)
{
	HOLD_INTERRUPTS();

	ereport(COMMERROR,
			(errcode(ERRCODE_PROTOCOL_VIOLATION),
			 errmsg("%s",err_msg)));

	RESUME_INTERRUPTS();
}
Example #6
inline
void
AbstractionLayer::Allocator::free(void *inPtr) const {
    if (inPtr == NULL)
        return;
        
    /*
     * See allocate(const size_t, const std::nothrow_t&) why we disable
     * processing of interrupts.
     */
    HOLD_INTERRUPTS();
    PG_TRY(); {
        pfree(unaligned(inPtr));
    } PG_CATCH(); {
        FlushErrorState();
    } PG_END_TRY();
    RESUME_INTERRUPTS();
}
Example #7
void
FileRepSubProcess_Main()
{
	const char *statmsg;

	MemoryContext fileRepSubProcessMemoryContext;

	sigjmp_buf	local_sigjmp_buf;

	MyProcPid = getpid();

	MyStartTime = time(NULL);

	/*
	 * Create a PGPROC so we can use LWLocks in FileRep sub-processes.  The
	 * routine also registers cleanup at process exit
	 */
	InitAuxiliaryProcess();

	InitBufferPoolBackend();

	FileRepSubProcess_ConfigureSignals();

	/*
	 * If an exception is encountered, processing resumes here.
	 *
	 * See notes in postgres.c about the design of this coding.
	 */
	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
	{
		/* Prevents interrupts while cleaning up */
		HOLD_INTERRUPTS();

		/* Report the error to the server log */
		EmitErrorReport();

		LWLockReleaseAll();

		if (FileRepPrimary_IsResyncManagerOrWorker())
		{
			LockReleaseAll(DEFAULT_LOCKMETHOD, false);
		}

		if (FileRepIsBackendSubProcess(fileRepProcessType))
		{
			AbortBufferIO();
			UnlockBuffers();

			/* buffer pins are released here: */
			ResourceOwnerRelease(CurrentResourceOwner,
								 RESOURCE_RELEASE_BEFORE_LOCKS,
								 false, true);
		}

		/*
		 * We can now go away.	Note that because we'll call InitProcess, a
		 * callback will be registered to do ProcKill, which will clean up
		 * necessary state.
		 */
		proc_exit(0);
	}

	/* We can now handle ereport(ERROR) */
	PG_exception_stack = &local_sigjmp_buf;

	PG_SETMASK(&UnBlockSig);

	/*
	 * Identify myself via ps
	 */

	statmsg = FileRepProcessTypeToString[fileRepProcessType];

	init_ps_display(statmsg, "", "", "");

	/* Create the memory context where cross-transaction state is stored */
	fileRepSubProcessMemoryContext = AllocSetContextCreate(TopMemoryContext,
														   "filerep subprocess memory context",
														   ALLOCSET_DEFAULT_MINSIZE,
														   ALLOCSET_DEFAULT_INITSIZE,
														   ALLOCSET_DEFAULT_MAXSIZE);

	MemoryContextSwitchTo(fileRepSubProcessMemoryContext);

	stateChangeRequestCounter++;

	FileRepSubProcess_ProcessSignals();

	switch (fileRepProcessType)
	{
		case FileRepProcessTypePrimarySender:
			FileRepPrimary_StartSender();
			break;

		case FileRepProcessTypeMirrorReceiver:
			FileRepMirror_StartReceiver();
			break;

		case FileRepProcessTypeMirrorConsumer:
		case FileRepProcessTypeMirrorConsumerWriter:
		case FileRepProcessTypeMirrorConsumerAppendOnly1:
			FileRepMirror_StartConsumer();
			break;

		case FileRepProcessTypeMirrorSenderAck:
			FileRepAckMirror_StartSender();
			break;

		case FileRepProcessTypePrimaryReceiverAck:
			FileRepAckPrimary_StartReceiver();
			break;

		case FileRepProcessTypePrimaryConsumerAck:
			FileRepAckPrimary_StartConsumer();
			break;

		case FileRepProcessTypePrimaryRecovery:
			FileRepSubProcess_InitProcess();

			/*
			 * At this point, database is starting up and xlog is not yet
			 * replayed.  Initializing relcache now is dangerous, a sequential
			 * scan of catalog tables may end up with incorrect hint bits.
			 * E.g. a committed transaction's dirty heap pages made it to disk
			 * but pg_clog update was still in memory and we crashed.  If a
			 * tuple inserted by this transaction is read during relcache
			 * initialization, status of the tuple's xmin will be incorrectly
			 * determined as "not commited" from pg_clog. And
			 * HEAP_XMIN_INVALID hint bit will be set, rendering the tuple
			 * perpetually invisible.  Relcache initialization must be
			 * deferred to only after all of xlog has been replayed.
			 */
			FileRepPrimary_StartRecovery();

			ResourceOwnerRelease(CurrentResourceOwner,
								 RESOURCE_RELEASE_BEFORE_LOCKS,
								 false, true);
			break;

		case FileRepProcessTypeResyncManager:
			FileRepSubProcess_InitProcess();
			FileRepPrimary_StartResyncManager();

			ResourceOwnerRelease(CurrentResourceOwner,
								 RESOURCE_RELEASE_BEFORE_LOCKS,
								 false, true);
			break;

		case FileRepProcessTypeResyncWorker1:
		case FileRepProcessTypeResyncWorker2:
		case FileRepProcessTypeResyncWorker3:
		case FileRepProcessTypeResyncWorker4:
			FileRepSubProcess_InitProcess();
			FileRepPrimary_StartResyncWorker();

			ResourceOwnerRelease(CurrentResourceOwner,
								 RESOURCE_RELEASE_BEFORE_LOCKS,
								 false, true);
			break;

		default:
			elog(PANIC, "unrecognized process type: %s(%d)",
				 statmsg, fileRepProcessType);
			break;
	}

	switch (FileRepSubProcess_GetState())
	{
		case FileRepStateShutdown:
		case FileRepStateReady:
			proc_exit(0);
			break;

		default:
			proc_exit(2);
			break;
	}
}
Example #8
/*
 * Main entry point for checkpointer process
 *
 * This is invoked from AuxiliaryProcessMain, which has already created the
 * basic execution environment, but not enabled signals yet.
 */
void
CheckpointerMain(void)
{
	sigjmp_buf	local_sigjmp_buf;
	MemoryContext checkpointer_context;

	CheckpointerShmem->checkpointer_pid = MyProcPid;

	/*
	 * Properly accept or ignore signals the postmaster might send us
	 *
	 * Note: we deliberately ignore SIGTERM, because during a standard Unix
	 * system shutdown cycle, init will SIGTERM all processes at once.  We
	 * want to wait for the backends to exit, whereupon the postmaster will
	 * tell us it's okay to shut down (via SIGUSR2).
	 */
	pqsignal(SIGHUP, ChkptSigHupHandler);		/* set flag to read config
												 * file */
	pqsignal(SIGINT, ReqCheckpointHandler);		/* request checkpoint */
	pqsignal(SIGTERM, SIG_IGN); /* ignore SIGTERM */
	pqsignal(SIGQUIT, chkpt_quickdie);	/* hard crash time */
	pqsignal(SIGALRM, SIG_IGN);
	pqsignal(SIGPIPE, SIG_IGN);
	pqsignal(SIGUSR1, chkpt_sigusr1_handler);
	pqsignal(SIGUSR2, ReqShutdownHandler);		/* request shutdown */

	/*
	 * Reset some signals that are accepted by postmaster but not here
	 */
	pqsignal(SIGCHLD, SIG_DFL);
	pqsignal(SIGTTIN, SIG_DFL);
	pqsignal(SIGTTOU, SIG_DFL);
	pqsignal(SIGCONT, SIG_DFL);
	pqsignal(SIGWINCH, SIG_DFL);

	/* We allow SIGQUIT (quickdie) at all times */
	sigdelset(&BlockSig, SIGQUIT);

	/*
	 * Initialize so that first time-driven event happens at the correct time.
	 */
	last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);

	/*
	 * Create a resource owner to keep track of our resources (currently only
	 * buffer pins).
	 */
	CurrentResourceOwner = ResourceOwnerCreate(NULL, "Checkpointer");

	/*
	 * Create a memory context that we will do all our work in.  We do this so
	 * that we can reset the context during error recovery and thereby avoid
	 * possible memory leaks.  Formerly this code just ran in
	 * TopMemoryContext, but resetting that would be a really bad idea.
	 */
	checkpointer_context = AllocSetContextCreate(TopMemoryContext,
												 "Checkpointer",
												 ALLOCSET_DEFAULT_SIZES);
	MemoryContextSwitchTo(checkpointer_context);

	/*
	 * If an exception is encountered, processing resumes here.
	 *
	 * See notes in postgres.c about the design of this coding.
	 */
	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
	{
		/* Since not using PG_TRY, must reset error stack by hand */
		error_context_stack = NULL;

		/* Prevent interrupts while cleaning up */
		HOLD_INTERRUPTS();

		/* Report the error to the server log */
		EmitErrorReport();

		/*
		 * These operations are really just a minimal subset of
		 * AbortTransaction().  We don't have very many resources to worry
		 * about in checkpointer, but we do have LWLocks, buffers, and temp
		 * files.
		 */
		LWLockReleaseAll();
		ConditionVariableCancelSleep();
		pgstat_report_wait_end();
		AbortBufferIO();
		UnlockBuffers();
		/* buffer pins are released here: */
		ResourceOwnerRelease(CurrentResourceOwner,
							 RESOURCE_RELEASE_BEFORE_LOCKS,
							 false, true);
		/* we needn't bother with the other ResourceOwnerRelease phases */
		AtEOXact_Buffers(false);
		AtEOXact_SMgr();
		AtEOXact_Files();
		AtEOXact_HashTables(false);

		/* Warn any waiting backends that the checkpoint failed. */
		if (ckpt_active)
		{
			SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
			CheckpointerShmem->ckpt_failed++;
			CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started;
			SpinLockRelease(&CheckpointerShmem->ckpt_lck);

			ckpt_active = false;
		}

		/*
		 * Now return to normal top-level context and clear ErrorContext for
		 * next time.
		 */
		MemoryContextSwitchTo(checkpointer_context);
		FlushErrorState();

		/* Flush any leaked data in the top-level context */
		MemoryContextResetAndDeleteChildren(checkpointer_context);

		/* Now we can allow interrupts again */
		RESUME_INTERRUPTS();

		/*
		 * Sleep at least 1 second after any error.  A write error is likely
		 * to be repeated, and we don't want to be filling the error logs as
		 * fast as we can.
		 */
		pg_usleep(1000000L);

		/*
		 * Close all open files after any error.  This is helpful on Windows,
		 * where holding deleted files open causes various strange errors.
		 * It's not clear we need it elsewhere, but shouldn't hurt.
		 */
		smgrcloseall();
	}

	/* We can now handle ereport(ERROR) */
	PG_exception_stack = &local_sigjmp_buf;

	/*
	 * Unblock signals (they were blocked when the postmaster forked us)
	 */
	PG_SETMASK(&UnBlockSig);

	/*
	 * Ensure all shared memory values are set correctly for the config. Doing
	 * this here ensures no race conditions from other concurrent updaters.
	 */
	UpdateSharedMemoryConfig();

	/*
	 * Advertise our latch that backends can use to wake us up while we're
	 * sleeping.
	 */
	ProcGlobal->checkpointerLatch = &MyProc->procLatch;

	/*
	 * Loop forever
	 */
	for (;;)
	{
		bool		do_checkpoint = false;
		int			flags = 0;
		pg_time_t	now;
		int			elapsed_secs;
		int			cur_timeout;
		int			rc;

		/* Clear any already-pending wakeups */
		ResetLatch(MyLatch);

		/*
		 * Process any requests or signals received recently.
		 */
		AbsorbFsyncRequests();

		if (got_SIGHUP)
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP);

			/*
			 * Checkpointer is the last process to shut down, so we ask it to
			 * handle a range of other required tasks, most of which have
			 * nothing to do with checkpointing at all.
			 *
			 * For various reasons, some config values can change dynamically
			 * so the primary copy of them is held in shared memory to make
			 * sure all backends see the same value.  We make Checkpointer
			 * responsible for updating the shared memory copy if the
			 * parameter setting changes because of SIGHUP.
			 */
			UpdateSharedMemoryConfig();
		}
		if (checkpoint_requested)
		{
			checkpoint_requested = false;
			do_checkpoint = true;
			BgWriterStats.m_requested_checkpoints++;
		}
		if (shutdown_requested)
		{
			/*
			 * From here on, elog(ERROR) should end with exit(1), not send
			 * control back to the sigsetjmp block above
			 */
			ExitOnAnyError = true;
			/* Close down the database */
			ShutdownXLOG(0, 0);
			/* Normal exit from the checkpointer is here */
			proc_exit(0);		/* done */
		}

		/*
		 * Force a checkpoint if too much time has elapsed since the last one.
		 * Note that we count a timed checkpoint in stats only when this
		 * occurs without an external request, but we set the CAUSE_TIME flag
		 * bit even if there is also an external request.
		 */
		now = (pg_time_t) time(NULL);
		elapsed_secs = now - last_checkpoint_time;
		if (elapsed_secs >= CheckPointTimeout)
		{
			if (!do_checkpoint)
				BgWriterStats.m_timed_checkpoints++;
			do_checkpoint = true;
			flags |= CHECKPOINT_CAUSE_TIME;
		}

		/*
		 * Do a checkpoint if requested.
		 */
		if (do_checkpoint)
		{
			bool		ckpt_performed = false;
			bool		do_restartpoint;

			/*
			 * Check if we should perform a checkpoint or a restartpoint. As a
			 * side-effect, RecoveryInProgress() initializes TimeLineID if
			 * it's not set yet.
			 */
			do_restartpoint = RecoveryInProgress();

			/*
			 * Atomically fetch the request flags to figure out what kind of a
			 * checkpoint we should perform, and increase the started-counter
			 * to acknowledge that we've started a new checkpoint.
			 */
			SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
			flags |= CheckpointerShmem->ckpt_flags;
			CheckpointerShmem->ckpt_flags = 0;
			CheckpointerShmem->ckpt_started++;
			SpinLockRelease(&CheckpointerShmem->ckpt_lck);

			/*
			 * The end-of-recovery checkpoint is a real checkpoint that's
			 * performed while we're still in recovery.
			 */
			if (flags & CHECKPOINT_END_OF_RECOVERY)
				do_restartpoint = false;

			/*
			 * We will warn if (a) too soon since last checkpoint (whatever
			 * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
			 * since the last checkpoint start.  Note in particular that this
			 * implementation will not generate warnings caused by
			 * CheckPointTimeout < CheckPointWarning.
			 */
			if (!do_restartpoint &&
				(flags & CHECKPOINT_CAUSE_XLOG) &&
				elapsed_secs < CheckPointWarning)
				ereport(LOG,
						(errmsg_plural("checkpoints are occurring too frequently (%d second apart)",
				"checkpoints are occurring too frequently (%d seconds apart)",
									   elapsed_secs,
									   elapsed_secs),
						 errhint("Consider increasing the configuration parameter \"max_wal_size\".")));

			/*
			 * Initialize checkpointer-private variables used during
			 * checkpoint.
			 */
			ckpt_active = true;
			if (do_restartpoint)
				ckpt_start_recptr = GetXLogReplayRecPtr(NULL);
			else
				ckpt_start_recptr = GetInsertRecPtr();
			ckpt_start_time = now;
			ckpt_cached_elapsed = 0;

			/*
			 * Do the checkpoint.
			 */
			if (!do_restartpoint)
			{
				CreateCheckPoint(flags);
				ckpt_performed = true;
			}
			else
				ckpt_performed = CreateRestartPoint(flags);

			/*
			 * After any checkpoint, close all smgr files.  This is so we
			 * won't hang onto smgr references to deleted files indefinitely.
			 */
			smgrcloseall();

			/*
			 * Indicate checkpoint completion to any waiting backends.
			 */
			SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
			CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started;
			SpinLockRelease(&CheckpointerShmem->ckpt_lck);

			if (ckpt_performed)
			{
				/*
				 * Note we record the checkpoint start time not end time as
				 * last_checkpoint_time.  This is so that time-driven
				 * checkpoints happen at a predictable spacing.
				 */
				last_checkpoint_time = now;
			}
			else
			{
				/*
				 * We were not able to perform the restartpoint (checkpoints
				 * throw an ERROR in case of error).  Most likely because we
				 * have not received any new checkpoint WAL records since the
				 * last restartpoint. Try again in 15 s.
				 */
				last_checkpoint_time = now - CheckPointTimeout + 15;
			}

			ckpt_active = false;
		}

		/* Check for archive_timeout and switch xlog files if necessary. */
		CheckArchiveTimeout();

		/*
		 * Send off activity statistics to the stats collector.  (The reason
		 * why we re-use bgwriter-related code for this is that the bgwriter
		 * and checkpointer used to be just one process.  It's probably not
		 * worth the trouble to split the stats support into two independent
		 * stats message types.)
		 */
		pgstat_send_bgwriter();

		/*
		 * Sleep until we are signaled or it's time for another checkpoint or
		 * xlog file switch.
		 */
		now = (pg_time_t) time(NULL);
		elapsed_secs = now - last_checkpoint_time;
		if (elapsed_secs >= CheckPointTimeout)
			continue;			/* no sleep for us ... */
		cur_timeout = CheckPointTimeout - elapsed_secs;
		if (XLogArchiveTimeout > 0 && !RecoveryInProgress())
		{
			elapsed_secs = now - last_xlog_switch_time;
			if (elapsed_secs >= XLogArchiveTimeout)
				continue;		/* no sleep for us ... */
			cur_timeout = Min(cur_timeout, XLogArchiveTimeout - elapsed_secs);
		}

		rc = WaitLatch(MyLatch,
					   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
					   cur_timeout * 1000L /* convert to ms */,
					   WAIT_EVENT_CHECKPOINTER_MAIN);

		/*
		 * Emergency bailout if postmaster has died.  This is to avoid the
		 * necessity for manual cleanup of all postmaster children.
		 */
		if (rc & WL_POSTMASTER_DEATH)
			exit(1);
	}
}
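Several of these examples (the filerep, checkpointer, autovacuum, bgwriter, and walwriter mains) share the same recovery shape. A distilled skeleton of that pattern, with the process-specific cleanup elided:

static void
example_auxiliary_main_skeleton(void)
{
	sigjmp_buf	local_sigjmp_buf;

	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
	{
		/* not using PG_TRY, so reset the error callback stack by hand */
		error_context_stack = NULL;

		/* keep cancel/die interrupts out while we clean up */
		HOLD_INTERRUPTS();

		/* report the error, then release whatever this process may hold */
		EmitErrorReport();
		LWLockReleaseAll();
		/* ... buffers, files, other per-process state ... */
		FlushErrorState();

		/* cleanup done; interrupts may be serviced again */
		RESUME_INTERRUPTS();
	}

	/* subsequent ereport(ERROR) calls longjmp back to the block above */
	PG_exception_stack = &local_sigjmp_buf;

	for (;;)
	{
		/* ... the process's normal work loop ... */
	}
}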
Example #9
void
CdbCheckDispatchResult_internal(struct CdbDispatcherState *ds,
								struct SegmentDatabaseDescriptor ***failedSegDB,
								int *numOfFailed, DispatchWaitMode waitMode)
{
	int	i;
	int	j;
	int	nFailed = 0;
	DispatchCommandParms *pParms;
	CdbDispatchResult *dispatchResult;
	SegmentDatabaseDescriptor *segdbDesc;

	Assert(ds != NULL);

	if (failedSegDB)
		*failedSegDB = NULL;
	if (numOfFailed)
		*numOfFailed = 0;

	/*
	 * No-op if no work was dispatched since the last time we were called.
	 */
	if (!ds->dispatchThreads || ds->dispatchThreads->threadCount == 0)
	{
		elog(DEBUG5, "CheckDispatchResult: no threads active");
		return;
	}

	/*
	 * Wait for threads to finish.
	 */
	for (i = 0; i < ds->dispatchThreads->threadCount; i++)
	{
		pParms = &ds->dispatchThreads->dispatchCommandParmsAr[i];
		Assert(pParms != NULL);

		/*
		 * Does caller want to stop short?
		 */
		switch (waitMode)
		{
			case DISPATCH_WAIT_CANCEL:
			case DISPATCH_WAIT_FINISH:
				pParms->waitMode = waitMode;
				break;
			default:
				break;
		}

		if (gp_connections_per_thread == 0)
		{
			thread_DispatchWait(pParms);
		}
		else
		{
			elog(DEBUG4, "CheckDispatchResult: Joining to thread %d of %d",
				 i + 1, ds->dispatchThreads->threadCount);

			if (pParms->thread_valid)
			{
				int			pthread_err = 0;

				pthread_err = pthread_join(pParms->thread, NULL);
				if (pthread_err != 0)
					elog(FATAL,
						 "CheckDispatchResult: pthread_join failed on thread %d (%lu) of %d (returned %d attempting to join to %lu)",
						 i + 1,
#ifndef _WIN32
						 (unsigned long) pParms->thread,
#else
						 (unsigned long) pParms->thread.p,
#endif
						 ds->dispatchThreads->threadCount, pthread_err,
						 (unsigned long) mythread());
			}
		}
		HOLD_INTERRUPTS();
		pParms->thread_valid = false;
		MemSet(&pParms->thread, 0, sizeof(pParms->thread));
		RESUME_INTERRUPTS();

		/*
		 * Examine the CdbDispatchResult objects containing the results
		 * from this thread's QEs.
		 */
		for (j = 0; j < pParms->db_count; j++)
		{
			dispatchResult = pParms->dispatchResultPtrArray[j];

			if (dispatchResult == NULL)
			{
				elog(LOG, "CheckDispatchResult: result object is NULL ? skipping.");
				continue;
			}

			if (dispatchResult->segdbDesc == NULL)
			{
				elog(LOG, "CheckDispatchResult: result object segment descriptor is NULL ? skipping.");
				continue;
			}

			segdbDesc = dispatchResult->segdbDesc;

			/*
			 * segdbDesc error message is unlikely here, but check anyway.
			 */
			if (segdbDesc->errcode || segdbDesc->error_message.len)
				cdbdisp_mergeConnectionErrors(dispatchResult, segdbDesc);

			/*
			 * Log the result
			 */
			if (DEBUG2 >= log_min_messages)
				cdbdisp_debugDispatchResult(dispatchResult, DEBUG2, DEBUG3);

			/*
			 * Notify FTS to reconnect if connection lost or never connected.
			 */
			if (failedSegDB && PQstatus(segdbDesc->conn) == CONNECTION_BAD)
			{
				/*
				 * Allocate storage.  Caller should pfree() it.
				 */
				if (!*failedSegDB)
					*failedSegDB = palloc(sizeof(**failedSegDB) * (2 * getgpsegmentCount() + 1));

				/*
				 * Append to broken connection list.
				 */
				(*failedSegDB)[nFailed++] = segdbDesc;
				(*failedSegDB)[nFailed] = NULL;

				if (numOfFailed)
					*numOfFailed = nFailed;
			}

			/*
			 * Zap our SegmentDatabaseDescriptor ptr because it may be
			 * invalidated by the call to FtsHandleNetFailure() below.
			 * Anything we need from there, we should get before this.
			 */
			dispatchResult->segdbDesc = NULL;

		}
	}

	/*
	 * reset thread state (will be destroyed later on in finishCommand)
	 */
	ds->dispatchThreads->threadCount = 0;

	/*
	 * It looks like everything went fine, but make sure we don't miss a
	 * user cancellation.
	 *
	 * The waitMode argument is NONE when we are doing "normal work".
	 */
	if (waitMode == DISPATCH_WAIT_NONE || waitMode == DISPATCH_WAIT_FINISH)
		CHECK_FOR_INTERRUPTS();
}
Example #10
/*
 * master_create_worker_shards creates empty shards for the given table based
 * on the specified number of initial shards. The function first gets a list of
 * candidate nodes and issues DDL commands on the nodes to create empty shard
 * placements on those nodes. The function then updates metadata on the master
 * node to make this shard (and its placements) visible. Note that the function
 * assumes the table is hash partitioned and calculates the min/max hash token
 * ranges for each shard, giving them an equal split of the hash space.
 */
Datum
master_create_worker_shards(PG_FUNCTION_ARGS)
{
	text *tableNameText = PG_GETARG_TEXT_P(0);
	int32 shardCount = PG_GETARG_INT32(1);
	int32 replicationFactor = PG_GETARG_INT32(2);

	Oid distributedTableId = ResolveRelationId(tableNameText);
	char relationKind = get_rel_relkind(distributedTableId);
	char *tableName = text_to_cstring(tableNameText);
	char shardStorageType = '\0';
	int32 shardIndex = 0;
	List *workerNodeList = NIL;
	List *ddlCommandList = NIL;
	int32 workerNodeCount = 0;
	uint32 placementAttemptCount = 0;
	uint32 hashTokenIncrement = 0;
	List *existingShardList = NIL;

	/* make sure table is hash partitioned */
	CheckHashPartitionedTable(distributedTableId);

	/* validate that shards haven't already been created for this table */
	existingShardList = LoadShardIntervalList(distributedTableId);
	if (existingShardList != NIL)
	{
		ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						errmsg("table \"%s\" has already had shards created for it",
							   tableName)));
	}

	/* make sure that at least one shard is specified */
	if (shardCount <= 0)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("shardCount must be positive")));
	}

	/* make sure that at least one replica is specified */
	if (replicationFactor <= 0)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("replicationFactor must be positive")));
	}

	/* calculate the split of the hash space */
	hashTokenIncrement = UINT_MAX / shardCount;

	/* load and sort the worker node list for deterministic placement */
	workerNodeList = ParseWorkerNodeFile(WORKER_LIST_FILENAME);
	workerNodeList = SortList(workerNodeList, CompareWorkerNodes);

	/* make sure we don't process cancel signals until all shards are created */
	HOLD_INTERRUPTS();

	/* retrieve the DDL commands for the table */
	ddlCommandList = TableDDLCommandList(distributedTableId);

	workerNodeCount = list_length(workerNodeList);
	if (replicationFactor > workerNodeCount)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("replicationFactor (%d) exceeds number of worker nodes "
							   "(%d)", replicationFactor, workerNodeCount),
						errhint("Add more worker nodes or try again with a lower "
								"replication factor.")));
	}

	/* if we have enough nodes, add an extra placement attempt for backup */
	placementAttemptCount = (uint32) replicationFactor;
	if (workerNodeCount > replicationFactor)
	{
		placementAttemptCount++;
	}

	/* set shard storage type according to relation type */
	if (relationKind == RELKIND_FOREIGN_TABLE)
	{
		shardStorageType = SHARD_STORAGE_FOREIGN;
	}
	else
	{
		shardStorageType = SHARD_STORAGE_TABLE;
	}

	for (shardIndex = 0; shardIndex < shardCount; shardIndex++)
	{
		uint64 shardId = NextSequenceId(SHARD_ID_SEQUENCE_NAME);
		int32 placementCount = 0;
		uint32 placementIndex = 0;
		uint32 roundRobinNodeIndex = shardIndex % workerNodeCount;

		List *extendedDDLCommands = ExtendedDDLCommandList(distributedTableId, shardId,
														   ddlCommandList);

		/* initialize the hash token space for this shard */
		text *minHashTokenText = NULL;
		text *maxHashTokenText = NULL;
		int32 shardMinHashToken = INT_MIN + (shardIndex * hashTokenIncrement);
		int32 shardMaxHashToken = shardMinHashToken + hashTokenIncrement - 1;

		/* if we are at the last shard, make sure the max token value is INT_MAX */
		if (shardIndex == (shardCount - 1))
		{
			shardMaxHashToken = INT_MAX;
		}

		for (placementIndex = 0; placementIndex < placementAttemptCount; placementIndex++)
		{
			int32 candidateNodeIndex =
				(roundRobinNodeIndex + placementIndex) % workerNodeCount;
			WorkerNode *candidateNode = (WorkerNode *) list_nth(workerNodeList,
																candidateNodeIndex);
			char *nodeName = candidateNode->nodeName;
			uint32 nodePort = candidateNode->nodePort;

			bool created = ExecuteRemoteCommandList(nodeName, nodePort,
													extendedDDLCommands);
			if (created)
			{
				uint64 shardPlacementId = 0;
				ShardState shardState = STATE_FINALIZED;


				shardPlacementId = NextSequenceId(SHARD_PLACEMENT_ID_SEQUENCE_NAME);
				InsertShardPlacementRow(shardPlacementId, shardId, shardState,
										nodeName, nodePort);
				placementCount++;
			}
			else
			{
				ereport(WARNING, (errmsg("could not create shard on \"%s:%u\"",
										 nodeName, nodePort)));
			}

			if (placementCount >= replicationFactor)
			{
				break;
			}
		}

		/* check if we created enough shard replicas */
		if (placementCount < replicationFactor)
		{
			ereport(ERROR, (errmsg("could not satisfy specified replication factor"),
							errdetail("Created %d shard replicas, less than the "
									  "requested replication factor of %d.",
									  placementCount, replicationFactor)));
		}

		/* insert the shard metadata row along with its min/max values */
		minHashTokenText = IntegerToText(shardMinHashToken);
		maxHashTokenText = IntegerToText(shardMaxHashToken);
		InsertShardRow(distributedTableId, shardId, shardStorageType,
					   minHashTokenText, maxHashTokenText);
	}

	if (QueryCancelPending)
	{
		ereport(WARNING, (errmsg("cancel requests are ignored during shard creation")));
		QueryCancelPending = false;
	}

	RESUME_INTERRUPTS();

	PG_RETURN_VOID();
}
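The interrupt handling above follows a reusable shape: hold off cancel interrupts across a multi-step operation that must not be left half-done, then acknowledge and clear any cancel that arrived before resuming. A generic sketch with illustrative names, not taken from the Citus source:

static void
example_uninterruptible_operation(void)
{
	/* don't service cancel requests until every step has completed */
	HOLD_INTERRUPTS();

	/* ... step 1: create remote objects ... */
	/* ... step 2: record metadata for them ... */

	/* a cancel may have arrived meanwhile; acknowledge it and drop it */
	if (QueryCancelPending)
	{
		ereport(WARNING,
				(errmsg("cancel requests are ignored during this operation")));
		QueryCancelPending = false;
	}

	RESUME_INTERRUPTS();
}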
Example #11
/*
 * AutoVacMain
 */
NON_EXEC_STATIC void
AutoVacMain(int argc, char *argv[])
{
    ListCell   *cell;
    List	   *dblist;
    autovac_dbase *db;
    TransactionId xidForceLimit;
    bool		for_xid_wrap;
    sigjmp_buf	local_sigjmp_buf;

    /* we are a postmaster subprocess now */
    IsUnderPostmaster = true;
    am_autovacuum = true;

    /* MPP-4990: Autovacuum always runs as utility-mode */
    Gp_role = GP_ROLE_UTILITY;

    /* reset MyProcPid */
    MyProcPid = getpid();

    /* record Start Time for logging */
    MyStartTime = time(NULL);

    /* Identify myself via ps */
    init_ps_display("autovacuum process", "", "", "");

    SetProcessingMode(InitProcessing);

    /*
     * If possible, make this process a group leader, so that the postmaster
     * can signal any child processes too.  (autovacuum probably never has
     * any child processes, but for consistency we make all postmaster
     * child processes do this.)
     */
#ifdef HAVE_SETSID
    if (setsid() < 0)
        elog(FATAL, "setsid() failed: %m");
#endif

    /*
     * Set up signal handlers.	We operate on databases much like a regular
     * backend, so we use the same signal handling.  See equivalent code in
     * tcop/postgres.c.
     *
     * Currently, we don't pay attention to postgresql.conf changes that
     * happen during a single daemon iteration, so we can ignore SIGHUP.
     */
    pqsignal(SIGHUP, SIG_IGN);

    /*
     * SIGINT is used to signal cancelling the current table's vacuum; SIGTERM
     * means abort and exit cleanly, and SIGQUIT means abandon ship.
     */
    pqsignal(SIGINT, StatementCancelHandler);
    pqsignal(SIGTERM, die);
    pqsignal(SIGQUIT, quickdie);
    pqsignal(SIGALRM, handle_sig_alarm);

    pqsignal(SIGPIPE, SIG_IGN);
    pqsignal(SIGUSR1, procsignal_sigusr1_handler);
    /* We don't listen for async notifies */
    pqsignal(SIGUSR2, SIG_IGN);
    pqsignal(SIGFPE, FloatExceptionHandler);
    pqsignal(SIGCHLD, SIG_DFL);

    /* Early initialization */
    BaseInit();

    /*
     * Create a per-backend PGPROC struct in shared memory, except in the
     * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do
     * this before we can use LWLocks (and in the EXEC_BACKEND case we already
     * had to do some stuff with LWLocks).
     */
#ifndef EXEC_BACKEND
    InitProcess();
#endif

    /*
     * If an exception is encountered, processing resumes here.
     *
     * See notes in postgres.c about the design of this coding.
     */
    if (sigsetjmp(local_sigjmp_buf, 1) != 0)
    {
        /* Prevents interrupts while cleaning up */
        HOLD_INTERRUPTS();

        /* Report the error to the server log */
        EmitErrorReport();

        /*
         * We can now go away.	Note that because we called InitProcess, a
         * callback was registered to do ProcKill, which will clean up
         * necessary state.
         */
        proc_exit(0);
    }

    /* We can now handle ereport(ERROR) */
    PG_exception_stack = &local_sigjmp_buf;

    PG_SETMASK(&UnBlockSig);

    /*
     * Force zero_damaged_pages OFF in the autovac process, even if it is set
     * in postgresql.conf.	We don't really want such a dangerous option being
     * applied non-interactively.
     */
    SetConfigOption("zero_damaged_pages", "false", PGC_SUSET, PGC_S_OVERRIDE);

    /* Get a list of databases */
    dblist = autovac_get_database_list();

    /*
     * Determine the oldest datfrozenxid/relfrozenxid that we will allow
     * to pass without forcing a vacuum.  (This limit can be tightened for
     * particular tables, but not loosened.)
     */
    recentXid = ReadNewTransactionId();
    xidForceLimit = recentXid - autovacuum_freeze_max_age;
    /* ensure it's a "normal" XID, else TransactionIdPrecedes misbehaves */
    if (xidForceLimit < FirstNormalTransactionId)
        xidForceLimit -= FirstNormalTransactionId;

    /*
     * Choose a database to connect to.  We pick the database that was least
     * recently auto-vacuumed, or one that needs vacuuming to prevent Xid
     * wraparound-related data loss.  If any db at risk of wraparound is
     * found, we pick the one with oldest datfrozenxid,
     * independently of autovacuum times.
     *
     * Note that a database with no stats entry is not considered, except for
     * Xid wraparound purposes.  The theory is that if no one has ever
     * connected to it since the stats were last initialized, it doesn't need
     * vacuuming.
     *
     * XXX This could be improved if we had more info about whether it needs
     * vacuuming before connecting to it.  Perhaps look through the pgstats
     * data for the database's tables?  One idea is to keep track of the
     * number of new and dead tuples per database in pgstats.  However it
     * isn't clear how to construct a metric that measures that and not cause
     * starvation for less busy databases.
     */
    db = NULL;
    for_xid_wrap = false;
    foreach(cell, dblist)
    {
        autovac_dbase *tmp = lfirst(cell);

        /* Find pgstat entry if any */
        tmp->entry = pgstat_fetch_stat_dbentry(tmp->oid);

        /* Check to see if this one is at risk of wraparound */
        if (TransactionIdPrecedes(tmp->frozenxid, xidForceLimit))
        {
            if (db == NULL ||
                    TransactionIdPrecedes(tmp->frozenxid, db->frozenxid))
                db = tmp;
            for_xid_wrap = true;
            continue;
        }
        else if (for_xid_wrap)
            continue;			/* ignore not-at-risk DBs */

        /*
         * Otherwise, skip a database with no pgstat entry; it means it
         * hasn't seen any activity.
         */
        if (!tmp->entry)
            continue;

        /*
         * Remember the db with oldest autovac time.  (If we are here,
         * both tmp->entry and db->entry must be non-null.)
         */
        if (db == NULL ||
                tmp->entry->last_autovac_time < db->entry->last_autovac_time)
            db = tmp;
    }
Example #12
/*
 * Workfile-manager specific function to clean up before releasing a
 * workfile set from the cache.
 *
 */
static void
workfile_mgr_cleanup_set(const void *resource)
{
	workfile_set *work_set = (workfile_set *) resource;

	/*
	 *  We have to make this callback function return cleanly ALL the
	 *  time; it must not throw an exception, and it will never be called
	 *  again, so we must clean up as much as we can here.  That means
	 *  holding interrupts and catching and handling all exceptions.
	 */

	if (work_set->on_disk)
	{
		ereport(gp_workfile_caching_loglevel,
				(errmsg("workfile mgr cleanup deleting set: key=0x%0xd, size=" INT64_FORMAT
				" in_progress_size=" INT64_FORMAT " path=%s",
				work_set->key,
				work_set->size,
				work_set->in_progress_size,
				work_set->path),
				errprintstack(true)));

		Assert(NULL == work_set->set_plan);

		PG_TRY();
		{

#ifdef FAULT_INJECTOR
			FaultInjector_InjectFaultIfSet(
				WorkfileCleanupSet,
				DDLNotSpecified,
				"", /* databaseName */
				"" /* tableName */
				);
#endif

			/* Prevent interrupts while cleaning up */
			HOLD_INTERRUPTS();

			workfile_mgr_delete_set_directory(work_set->path);

			/* Now we can allow interrupts again */
			RESUME_INTERRUPTS();

		}
		PG_CATCH();
		{
			elog(LOG, "Cleaning up workfile set directory path=%s failed. Proceeding",
					work_set->path);

			/* We're not re-throwing the error. Otherwise we'll end up having
			 * to clean up again, probably failing again.
			 */
		}
		PG_END_TRY();

		/*
		 * The most accurate size of a workset is recorded in work_set->in_progress_size.
		 * work_set->size is only updated when we close a file, so it lags behind
		 */

		Assert(work_set->in_progress_size >= work_set->size);
		int64 size_to_delete = work_set->in_progress_size;

		elog(gp_workfile_caching_loglevel, "Subtracting " INT64_FORMAT " from workfile diskspace", size_to_delete);

		/*
		 * When subtracting the size of this workset from our accounting,
		 * only update the per-query counter if we created the workset.
		 * In that case, the state is ACQUIRED, otherwise is CACHED or DELETED
		 */
		CacheEntry *cacheEntry = CACHE_ENTRY_HEADER(resource);
		bool update_query_space = (cacheEntry->state == CACHE_ENTRY_ACQUIRED);

		WorkfileDiskspace_Commit(0, size_to_delete, update_query_space);
	}
	else
	{
		/* Non-physical workfile set, we need to free up the plan memory */
		if (NULL != work_set->set_plan)
		{
			if (NULL != work_set->set_plan->serialized_plan)
			{
				pfree(work_set->set_plan->serialized_plan);
			}

			pfree(work_set->set_plan);
		}
	}
}
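The same no-throw callback shape can be written generically. A minimal skeleton, assuming the risky work is deleting a directory; the function name and path argument are illustrative only:

static void
example_noexcept_cleanup(const char *dir_path)
{
	PG_TRY();
	{
		/* keep cancel/die interrupts out of the risky section */
		HOLD_INTERRUPTS();

		/* ... remove dir_path and everything under it ... */

		RESUME_INTERRUPTS();
	}
	PG_CATCH();
	{
		/* a cleanup callback must always return: log, flush, don't re-throw */
		elog(LOG, "cleanup of %s failed, proceeding", dir_path);
		FlushErrorState();
	}
	PG_END_TRY();
}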
Example #13
/*
 * LWLockConditionalAcquire - acquire a lightweight lock in the specified mode
 *
 * If the lock is not available, return FALSE with no side-effects.
 *
 * If successful, cancel/die interrupts are held off until lock release.
 */
bool
LWLockConditionalAcquire(LWLockId lockid, LWLockMode mode)
{
	volatile LWLock *lock = &(LWLockArray[lockid].lock);
#if LWLOCK_LOCK_PARTS > 1
	volatile LWLockPart *part = LWLOCK_PART(lock, lockid, MyBackendId);
#endif
	bool		mustwait;

	PRINT_LWDEBUG("LWLockConditionalAcquire", lockid, lock);

	/* Ensure we will have room to remember the lock */
	if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS)
		elog(ERROR, "too many LWLocks taken");

	/*
	 * Lock out cancel/die interrupts until we exit the code section protected
	 * by the LWLock.  This ensures that interrupts will not interfere with
	 * manipulations of data structures in shared memory.
	 */
	HOLD_INTERRUPTS();

	if (mode == LW_SHARED)
	{
#ifdef LWLOCK_PART_SHARED_OPS_ATOMIC
		/* Increment shared counter partition. If there's no contention,
		 * this is sufficient to take the lock
		 */
		LWLOCK_PART_SHARED_POSTINC_ATOMIC(lock, lockid, part, MyBackendId);
		LWLOCK_PART_SHARED_FENCE();

		/* A concurrent exclusive locking attempt does the following
		 * three steps
		 *   1) Acquire mutex
		 *   2) Check shared counter partitions for readers.
		 *   3a) If found add proc to wait queue, block, restart at (1)
		 *   3b) If not found, set exclusive flag, continue with (4)
		 *   4) Enter protected section
		 * The fence after the atomic add above ensures that no further
		 * such attempt can proceed to (3b) or beyond. There may be
		 * pre-existing exclusive locking attempts at step (3b) or beyond,
		 * but we can recognize those by either the mutex being taken, or
		 * the exclusive flag being set. Conversely, if we see neither, we
		 * may proceed and enter the protected section.
		 *
		 * FIXME: This doesn't work if slock_t is a struct or doesn't
		 * use 0 for state "unlocked".
		 */

		if ((lock->mutex == 0) && (lock->exclusive == 0))
			goto lock_acquired;

		/* At this point, we don't know if the concurrent exclusive locker
		 * has proceeded to (3b) or blocked. We must take the mutex and
		 * re-check
		 */
#endif /* LWLOCK_PART_SHARED_OPS_ATOMIC */

		/* Acquire mutex.  Time spent holding mutex should be short! */
		SpinLockAcquire(&lock->mutex);

		if (lock->exclusive == 0)
		{
#ifdef LWLOCK_PART_SHARED_OPS_ATOMIC
			/* Already incremented the shared counter partition above */
#else
			lock->shared++;
#endif
			mustwait = false;
		}
		else
		{
#ifdef LWLOCK_PART_SHARED_OPS_ATOMIC
			/* Must undo shared counter partition increment. Note that
			 * we *need* to do that while holding the mutex. Otherwise,
			 * the exclusive lock could be released and attempted to be
			 * re-acquired before we undo the increment. That attempt
			 * would then block, even though there'd be no lock holder
			 * left
			 */
			LWLOCK_PART_SHARED_POSTDEC_ATOMIC(lock, lockid, part, MyBackendId);
#endif
			mustwait = true;
		}
	}
	else
	{
		/* Step (1). Acquire mutex. Time spent holding mutex should be
		 *                          short!
		 */
		SpinLockAcquire(&lock->mutex);

		if (lock->exclusive == 0)
		{
			/* Step (2). Check for shared lockers. This surely happens
			 * after (1), otherwise SpinLockAcquire() is broken. Lock
			 * acquire semantics demand that no load must be re-ordered
			 * from after a lock acquisition to before, for obvious
			 * reasons.
			 */

			LWLOCK_IS_SHARED(mustwait, lock, lockid);

			if (!mustwait) {
				/* Step (3a). Set exclusive flag. This surely happens
				 * after (2) because it depends on the result of (2),
				 * no matter how much reordering is going on here.
				 */
				lock->exclusive++;
			}
		}
		else
			mustwait = true;
	}

	/* We are done updating shared state of the lock itself. */
	SpinLockRelease(&lock->mutex);

	if (mustwait)
	{
		/* Failed to get lock, so release interrupt holdoff */
		RESUME_INTERRUPTS();
		LOG_LWDEBUG("LWLockConditionalAcquire", lockid, "failed");
		TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL(lockid, mode);
		
		return false;
	}

#ifdef LWLOCK_PART_SHARED_OPS_ATOMIC
lock_acquired:
#endif

	TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE(lockid, mode);

	/* Add lock to list of locks held by this backend */
	held_lwlocks[num_held_lwlocks] = lockid;
	held_lwlocks_mode[num_held_lwlocks] = mode;
	++num_held_lwlocks;

	return true;
}
Example #14
/*
 * Main entry point for bgwriter process
 *
 * This is invoked from BootstrapMain, which has already created the basic
 * execution environment, but not enabled signals yet.
 */
void
BackgroundWriterMain(void)
{
	sigjmp_buf	local_sigjmp_buf;
	MemoryContext bgwriter_context;

	BgWriterShmem->bgwriter_pid = MyProcPid;
	am_bg_writer = true;

	/*
	 * If possible, make this process a group leader, so that the postmaster
	 * can signal any child processes too.	(bgwriter probably never has any
	 * child processes, but for consistency we make all postmaster child
	 * processes do this.)
	 */
#ifdef HAVE_SETSID
	if (setsid() < 0)
		elog(FATAL, "setsid() failed: %m");
#endif

	/*
	 * Properly accept or ignore signals the postmaster might send us
	 *
	 * Note: we deliberately ignore SIGTERM, because during a standard Unix
	 * system shutdown cycle, init will SIGTERM all processes at once.	We
	 * want to wait for the backends to exit, whereupon the postmaster will
	 * tell us it's okay to shut down (via SIGUSR2).
	 *
	 * SIGUSR1 is presently unused; keep it spare in case someday we want this
	 * process to participate in sinval messaging.
	 */
	pqsignal(SIGHUP, BgSigHupHandler);	/* set flag to read config file */
	pqsignal(SIGINT, ReqCheckpointHandler);		/* request checkpoint */
	pqsignal(SIGTERM, SIG_IGN); /* ignore SIGTERM */
	pqsignal(SIGQUIT, bg_quickdie);		/* hard crash time */
	pqsignal(SIGALRM, SIG_IGN);
	pqsignal(SIGPIPE, SIG_IGN);
	pqsignal(SIGUSR1, SIG_IGN); /* reserve for sinval */
	pqsignal(SIGUSR2, ReqShutdownHandler);		/* request shutdown */

	/*
	 * Reset some signals that are accepted by postmaster but not here
	 */
	pqsignal(SIGCHLD, SIG_DFL);
	pqsignal(SIGTTIN, SIG_DFL);
	pqsignal(SIGTTOU, SIG_DFL);
	pqsignal(SIGCONT, SIG_DFL);
	pqsignal(SIGWINCH, SIG_DFL);

	/* We allow SIGQUIT (quickdie) at all times */
#ifdef HAVE_SIGPROCMASK
	sigdelset(&BlockSig, SIGQUIT);
#else
	BlockSig &= ~(sigmask(SIGQUIT));
#endif

	/*
	 * Initialize so that first time-driven event happens at the correct time.
	 */
	last_checkpoint_time = last_xlog_switch_time = time(NULL);

	/*
	 * Create a resource owner to keep track of our resources (currently only
	 * buffer pins).
	 */
	CurrentResourceOwner = ResourceOwnerCreate(NULL, "Background Writer");

	/*
	 * Create a memory context that we will do all our work in.  We do this so
	 * that we can reset the context during error recovery and thereby avoid
	 * possible memory leaks.  Formerly this code just ran in
	 * TopMemoryContext, but resetting that would be a really bad idea.
	 */
	bgwriter_context = AllocSetContextCreate(TopMemoryContext,
											 "Background Writer",
											 ALLOCSET_DEFAULT_MINSIZE,
											 ALLOCSET_DEFAULT_INITSIZE,
											 ALLOCSET_DEFAULT_MAXSIZE);
	MemoryContextSwitchTo(bgwriter_context);

	/*
	 * If an exception is encountered, processing resumes here.
	 *
	 * See notes in postgres.c about the design of this coding.
	 */
	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
	{
		/* Since not using PG_TRY, must reset error stack by hand */
		error_context_stack = NULL;

		/* Prevent interrupts while cleaning up */
		HOLD_INTERRUPTS();

		/* Report the error to the server log */
		EmitErrorReport();

		/*
		 * These operations are really just a minimal subset of
		 * AbortTransaction().	We don't have very many resources to worry
		 * about in bgwriter, but we do have LWLocks, buffers, and temp files.
		 */
		LWLockReleaseAll();
		AbortBufferIO();
		UnlockBuffers();
		/* buffer pins are released here: */
		ResourceOwnerRelease(CurrentResourceOwner,
							 RESOURCE_RELEASE_BEFORE_LOCKS,
							 false, true);
		/* we needn't bother with the other ResourceOwnerRelease phases */
		AtEOXact_Buffers(false);
		AtEOXact_Files();
		AtEOXact_HashTables(false);

		/* Warn any waiting backends that the checkpoint failed. */
		if (ckpt_active)
		{
			/* use volatile pointer to prevent code rearrangement */
			volatile BgWriterShmemStruct *bgs = BgWriterShmem;

			SpinLockAcquire(&bgs->ckpt_lck);
			bgs->ckpt_failed++;
			bgs->ckpt_done = bgs->ckpt_started;
			SpinLockRelease(&bgs->ckpt_lck);

			ckpt_active = false;
		}

		/*
		 * Now return to normal top-level context and clear ErrorContext for
		 * next time.
		 */
		MemoryContextSwitchTo(bgwriter_context);
		FlushErrorState();

		/* Flush any leaked data in the top-level context */
		MemoryContextResetAndDeleteChildren(bgwriter_context);

		/* Now we can allow interrupts again */
		RESUME_INTERRUPTS();

		/*
		 * Sleep at least 1 second after any error.  A write error is likely
		 * to be repeated, and we don't want to be filling the error logs as
		 * fast as we can.
		 */
		pg_usleep(1000000L);

		/*
		 * Close all open files after any error.  This is helpful on Windows,
		 * where holding deleted files open causes various strange errors.
		 * It's not clear we need it elsewhere, but shouldn't hurt.
		 */
		smgrcloseall();
	}

	/* We can now handle ereport(ERROR) */
	PG_exception_stack = &local_sigjmp_buf;

	/*
	 * Unblock signals (they were blocked when the postmaster forked us)
	 */
	PG_SETMASK(&UnBlockSig);

	/*
	 * Loop forever
	 */
	for (;;)
	{
		bool		do_checkpoint = false;
		int			flags = 0;
		time_t		now;
		int			elapsed_secs;

		/*
		 * Emergency bailout if postmaster has died.  This is to avoid the
		 * necessity for manual cleanup of all postmaster children.
		 */
		if (!PostmasterIsAlive(true))
			exit(1);

		/*
		 * Process any requests or signals received recently.
		 */
		AbsorbFsyncRequests();

		if (got_SIGHUP)
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP);
		}
		if (checkpoint_requested)
		{
			checkpoint_requested = false;
			do_checkpoint = true;
			BgWriterStats.m_requested_checkpoints++;
		}
		if (shutdown_requested)
		{
			/*
			 * From here on, elog(ERROR) should end with exit(1), not send
			 * control back to the sigsetjmp block above
			 */
			ExitOnAnyError = true;
			/* Close down the database */
			ShutdownXLOG(0, 0);
			DumpFreeSpaceMap(0, 0);
			/* Normal exit from the bgwriter is here */
			proc_exit(0);		/* done */
		}

		/*
		 * Force a checkpoint if too much time has elapsed since the last one.
		 * Note that we count a timed checkpoint in stats only when this
		 * occurs without an external request, but we set the CAUSE_TIME flag
		 * bit even if there is also an external request.
		 */
		now = time(NULL);
		elapsed_secs = now - last_checkpoint_time;
		if (elapsed_secs >= CheckPointTimeout)
		{
			if (!do_checkpoint)
				BgWriterStats.m_timed_checkpoints++;
			do_checkpoint = true;
			flags |= CHECKPOINT_CAUSE_TIME;
		}

		/*
		 * Do a checkpoint if requested, otherwise do one cycle of
		 * dirty-buffer writing.
		 */
		if (do_checkpoint)
		{
			/* use volatile pointer to prevent code rearrangement */
			volatile BgWriterShmemStruct *bgs = BgWriterShmem;

			/*
			 * Atomically fetch the request flags to figure out what kind of a
			 * checkpoint we should perform, and increase the started-counter
			 * to acknowledge that we've started a new checkpoint.
			 */
			SpinLockAcquire(&bgs->ckpt_lck);
			flags |= bgs->ckpt_flags;
			bgs->ckpt_flags = 0;
			bgs->ckpt_started++;
			SpinLockRelease(&bgs->ckpt_lck);

			/*
			 * We will warn if (a) too soon since last checkpoint (whatever
			 * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
			 * since the last checkpoint start.  Note in particular that this
			 * implementation will not generate warnings caused by
			 * CheckPointTimeout < CheckPointWarning.
			 */
			if ((flags & CHECKPOINT_CAUSE_XLOG) &&
				elapsed_secs < CheckPointWarning)
				ereport(LOG,
						(errmsg("checkpoints are occurring too frequently (%d seconds apart)",
								elapsed_secs),
						 errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));

			/*
			 * Initialize bgwriter-private variables used during checkpoint.
			 */
			ckpt_active = true;
			ckpt_start_recptr = GetInsertRecPtr();
			ckpt_start_time = now;
			ckpt_cached_elapsed = 0;

			/*
			 * Do the checkpoint.
			 */
			CreateCheckPoint(flags);

			/*
			 * After any checkpoint, close all smgr files.	This is so we
			 * won't hang onto smgr references to deleted files indefinitely.
			 */
			smgrcloseall();

			/*
			 * Indicate checkpoint completion to any waiting backends.
			 */
			SpinLockAcquire(&bgs->ckpt_lck);
			bgs->ckpt_done = bgs->ckpt_started;
			SpinLockRelease(&bgs->ckpt_lck);

			ckpt_active = false;

			/*
			 * Note we record the checkpoint start time not end time as
			 * last_checkpoint_time.  This is so that time-driven checkpoints
			 * happen at a predictable spacing.
			 */
			last_checkpoint_time = now;
		}
		else
			BgBufferSync();

		/* Check for archive_timeout and switch xlog files if necessary. */
		CheckArchiveTimeout();

		/* Nap for the configured time. */
		BgWriterNap();
	}
}
Example #15
inline
void *
AbstractionLayer::Allocator::internalAllocate(void *inPtr, const size_t inSize) const {
    // Avoid warning that inPtr is not used if R == NewAllocation
    (void) inPtr;
    
    void *ptr;
    bool errorOccurred = false;
    MemoryContext oldContext = NULL;
    MemoryContext aggContext = NULL;

    if (F == dbal::ReturnNULL) {
        /*
         * HOLD_INTERRUPTS() and RESUME_INTERRUPTS() only change the value of a
         * global variable but have no other side effects. In particular, they
         * do not call CHECK_FOR_INTERRUPTS(). Hence, we are safe to use these
         * macros outside of a PG_TRY() block.
         */
        HOLD_INTERRUPTS();
    }
        
    PG_TRY(); {
        if (MC == dbal::AggregateContext) {
            if (!AggCheckCallContext(fcinfo, &aggContext))
                errorOccurred = true;
            else {
                oldContext = MemoryContextSwitchTo(aggContext);
                ptr = (R == Reallocation) ? internalRePalloc<ZM>(inPtr, inSize)
                                          : internalPalloc<ZM>(inSize);
                MemoryContextSwitchTo(oldContext);
            }
        } else {
            ptr = R ? internalRePalloc<ZM>(inPtr, inSize)
                    : internalPalloc<ZM>(inSize);
        }
    } PG_CATCH(); {
        if (F == dbal::ReturnNULL) {
            /*
             * This cannot be due to an interrupt, so it's reasonably safe
             * to assume that the PG exception was a pure memory-allocation
             * issue. We ignore the error and flush the error state.
             * Flushing is necessary to leave the error state (among other
             * things, it restores the memory context).
             */
            FlushErrorState();
            ptr = NULL;
        } else {
            /*
             * PostgreSQL error messages can be stacked. So, it doesn't hurt to add
             * our own message. After unwinding the C++ stack, the PostgreSQL
             * exception will be re-thrown into the PostgreSQL C code.
             *
             * Throwing C++ exceptions inside a PG_CATCH block is not problematic
             * per se, but it is good practise to keep the exception mechanisms clearly
             * separated.
             */

            errorOccurred = true;
        }
    } PG_END_TRY();

    if (errorOccurred) {
        PG_TRY(); {
            // Clean up after ourselves
            if (oldContext != NULL)
                MemoryContextSwitchTo(oldContext);
        } PG_CATCH(); {
            if (F == dbal::ReturnNULL) {
                // We tried to clean up after ourselves. If this fails, we can
                // only ignore the issue.
                FlushErrorState();
            } 
            // Else do nothing. We will add a bad-allocation exception on top of
            // the existing PostgreSQL exception stack.
        } PG_END_TRY();
    }
    
    if (errorOccurred || !ptr)
        // We do not want to interleave PG exceptions and C++ exceptions.
        throw std::bad_alloc();

    if (F == dbal::ReturnNULL) {
        RESUME_INTERRUPTS();
    }
   
    return ptr;
}
Example #16
/*
 * Main entry point for walwriter process
 *
 * This is invoked from BootstrapMain, which has already created the basic
 * execution environment, but not enabled signals yet.
 */
void
WalWriterMain(void)
{
	sigjmp_buf	local_sigjmp_buf;
	MemoryContext walwriter_context;

	/*
	 * If possible, make this process a group leader, so that the postmaster
	 * can signal any child processes too.	(walwriter probably never has any
	 * child processes, but for consistency we make all postmaster child
	 * processes do this.)
	 */
#ifdef HAVE_SETSID
	if (setsid() < 0)
		elog(FATAL, "setsid() failed: %m");
#endif

	/*
	 * Properly accept or ignore signals the postmaster might send us
	 *
	 * We have no particular use for SIGINT at the moment, but seems
	 * reasonable to treat like SIGTERM.
	 */
	pqsignal(SIGHUP, WalSigHupHandler); /* set flag to read config file */
	pqsignal(SIGINT, WalShutdownHandler);		/* request shutdown */
	pqsignal(SIGTERM, WalShutdownHandler);		/* request shutdown */
	pqsignal(SIGQUIT, wal_quickdie);	/* hard crash time */
	pqsignal(SIGALRM, SIG_IGN);
	pqsignal(SIGPIPE, SIG_IGN);
	pqsignal(SIGUSR1, SIG_IGN); /* reserve for ProcSignal */
	pqsignal(SIGUSR2, SIG_IGN); /* not used */

	/*
	 * Reset some signals that are accepted by postmaster but not here
	 */
	pqsignal(SIGCHLD, SIG_DFL);
	pqsignal(SIGTTIN, SIG_DFL);
	pqsignal(SIGTTOU, SIG_DFL);
	pqsignal(SIGCONT, SIG_DFL);
	pqsignal(SIGWINCH, SIG_DFL);

	/* We allow SIGQUIT (quickdie) at all times */
	sigdelset(&BlockSig, SIGQUIT);

	/*
	 * Create a resource owner to keep track of our resources (not clear that
	 * we need this, but may as well have one).
	 */
	CurrentResourceOwner = ResourceOwnerCreate(NULL, "Wal Writer");

	/*
	 * Create a memory context that we will do all our work in.  We do this so
	 * that we can reset the context during error recovery and thereby avoid
	 * possible memory leaks.  Formerly this code just ran in
	 * TopMemoryContext, but resetting that would be a really bad idea.
	 */
	walwriter_context = AllocSetContextCreate(TopMemoryContext,
											  "Wal Writer",
											  ALLOCSET_DEFAULT_MINSIZE,
											  ALLOCSET_DEFAULT_INITSIZE,
											  ALLOCSET_DEFAULT_MAXSIZE);
	MemoryContextSwitchTo(walwriter_context);

	/*
	 * If an exception is encountered, processing resumes here.
	 *
	 * This code is heavily based on bgwriter.c, q.v.
	 */
	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
	{
		/* Since not using PG_TRY, must reset error stack by hand */
		error_context_stack = NULL;

		/* Prevent interrupts while cleaning up */
		HOLD_INTERRUPTS();

		/* Report the error to the server log */
		EmitErrorReport();

		/*
		 * These operations are really just a minimal subset of
		 * AbortTransaction().	We don't have very many resources to worry
		 * about in walwriter, but we do have LWLocks, and perhaps buffers?
		 */
		LWLockReleaseAll();
		AbortBufferIO();
		UnlockBuffers();
		/* buffer pins are released here: */
		ResourceOwnerRelease(CurrentResourceOwner,
							 RESOURCE_RELEASE_BEFORE_LOCKS,
							 false, true);
		/* we needn't bother with the other ResourceOwnerRelease phases */
		AtEOXact_Buffers(false);
		AtEOXact_Files();
		AtEOXact_HashTables(false);

		/*
		 * Now return to normal top-level context and clear ErrorContext for
		 * next time.
		 */
		MemoryContextSwitchTo(walwriter_context);
		FlushErrorState();

		/* Flush any leaked data in the top-level context */
		MemoryContextResetAndDeleteChildren(walwriter_context);

		/* Now we can allow interrupts again */
		RESUME_INTERRUPTS();

		/*
		 * Sleep at least 1 second after any error.  A write error is likely
		 * to be repeated, and we don't want to be filling the error logs as
		 * fast as we can.
		 */
		pg_usleep(1000000L);

		/*
		 * Close all open files after any error.  This is helpful on Windows,
		 * where holding deleted files open causes various strange errors.
		 * It's not clear we need it elsewhere, but shouldn't hurt.
		 */
		smgrcloseall();
	}

	/* We can now handle ereport(ERROR) */
	PG_exception_stack = &local_sigjmp_buf;

	/*
	 * Unblock signals (they were blocked when the postmaster forked us)
	 */
	PG_SETMASK(&UnBlockSig);

	/*
	 * Loop forever
	 */
	for (;;)
	{
		long		udelay;

		/*
		 * Emergency bailout if postmaster has died.  This is to avoid the
		 * necessity for manual cleanup of all postmaster children.
		 */
		if (!PostmasterIsAlive(true))
			exit(1);

		/*
		 * Process any requests or signals received recently.
		 */
		if (got_SIGHUP)
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP);
		}
		if (shutdown_requested)
		{
			/* Normal exit from the walwriter is here */
			proc_exit(0);		/* done */
		}

		/*
		 * Do what we're here for...
		 */
		XLogBackgroundFlush();

		/*
		 * Delay until time to do something more, but fall out of delay
		 * reasonably quickly if signaled.
		 */
		udelay = WalWriterDelay * 1000L;
		while (udelay > 999999L)
		{
			if (got_SIGHUP || shutdown_requested)
				break;
			pg_usleep(1000000L);
			udelay -= 1000000L;
		}
		if (!(got_SIGHUP || shutdown_requested))
			pg_usleep(udelay);
	}
}
Example #17
0
/*
 *	lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation
 *
 *		This routine vacuums a single heap, cleans out its indexes, and
 *		updates its relpages and reltuples statistics.
 *
 *		At entry, we have already established a transaction and opened
 *		and locked the relation.
 *
 *		The return value indicates whether this function has held off
 *		interrupts -- caller must RESUME_INTERRUPTS() after commit if true.
 */
bool
lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
				BufferAccessStrategy bstrategy, List *updated_stats)
{
	LVRelStats *vacrelstats;
	Relation   *Irel;
	int			nindexes;
	BlockNumber possibly_freeable;
	PGRUsage	ru0;
	TimestampTz starttime = 0;
	bool		heldoff = false;

	pg_rusage_init(&ru0);

	/* measure elapsed time iff autovacuum logging requires it */
	if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration > 0)
		starttime = GetCurrentTimestamp();

	if (vacstmt->verbose)
		elevel = INFO;
	else
		elevel = DEBUG2;

	if (Gp_role == GP_ROLE_DISPATCH)
		elevel = DEBUG2; /* vacuum and analyze messages aren't interesting from the QD */

#ifdef FAULT_INJECTOR
	if (vacuumStatement_IsInAppendOnlyDropPhase(vacstmt))
	{
			FaultInjector_InjectFaultIfSet(
				CompactionBeforeSegmentFileDropPhase,
				DDLNotSpecified,
				"",	// databaseName
				""); // tableName
	}
	if (vacummStatement_IsInAppendOnlyCleanupPhase(vacstmt))
	{
			FaultInjector_InjectFaultIfSet(
				CompactionBeforeCleanupPhase,
				DDLNotSpecified,
				"",	// databaseName
				""); // tableName
	}
#endif

	/*
	 * MPP-23647.  Update xid limits for heap as well as appendonly
	 * relations.  This allows setting relfrozenxid to the correct value
	 * for an appendonly (AO/CO) table.
	 */
	vac_strategy = bstrategy;

	vacuum_set_xid_limits(vacstmt->freeze_min_age, onerel->rd_rel->relisshared,
						  &OldestXmin, &FreezeLimit);

	/*
	 * Execute the various vacuum operations. Appendonly tables are treated
	 * differently.
	 */
	if (RelationIsAoRows(onerel) || RelationIsAoCols(onerel))
	{
		lazy_vacuum_aorel(onerel, vacstmt, updated_stats);
		return false;
	}

	vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));

	/* heap relation */

	/* Set threshold for interesting free space = average request size */
	/* XXX should we scale it up or down?  Adjust vacuum.c too, if so */
	vacrelstats->threshold = GetAvgFSMRequestSize(&onerel->rd_node);

	vacrelstats->num_index_scans = 0;

	/* Open all indexes of the relation */
	vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel);
	vacrelstats->hasindex = (nindexes > 0);

	/* Do the vacuuming */
	lazy_scan_heap(onerel, vacrelstats, Irel, nindexes, updated_stats);

	/* Done with indexes */
	vac_close_indexes(nindexes, Irel, NoLock);

	/*
	 * Optionally truncate the relation.
	 *
	 * Don't even think about it unless we have a shot at releasing a goodly
	 * number of pages.  Otherwise, the time taken isn't worth it.
	 *
	 * Note that after we've truncated the heap, it's too late to abort the
	 * transaction; doing so would lose the sinval messages needed to tell
	 * the other backends about the table being shrunk.  We prevent interrupts
	 * in that case; caller is responsible for re-enabling them after
	 * committing the transaction.
	 */
	possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages;
	if (possibly_freeable > 0 &&
		(possibly_freeable >= REL_TRUNCATE_MINIMUM ||
		 possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION))
	{
		HOLD_INTERRUPTS();
		heldoff = true;
		lazy_truncate_heap(onerel, vacrelstats);
	}

	/* Update shared free space map with final free space info */
	lazy_update_fsm(onerel, vacrelstats);

	if (vacrelstats->tot_free_pages > MaxFSMPages)
		ereport(WARNING,
				(errmsg("relation \"%s.%s\" contains more than \"max_fsm_pages\" pages with useful free space",
						get_namespace_name(RelationGetNamespace(onerel)),
						RelationGetRelationName(onerel)),
				 /* Only suggest VACUUM FULL if > 20% free */
				 (vacrelstats->tot_free_pages > vacrelstats->rel_pages * 0.20) ?
				 errhint("Consider using VACUUM FULL on this relation or increasing the configuration parameter \"max_fsm_pages\".") :
				 errhint("Consider increasing the configuration parameter \"max_fsm_pages\".")));

	/* Update statistics in pg_class */
	vac_update_relstats_from_list(onerel,
						vacrelstats->rel_pages,
						vacrelstats->rel_tuples,
						vacrelstats->hasindex,
						FreezeLimit,
						updated_stats);

	/* report results to the stats collector, too */
	pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared,
						 true /*vacrelstats->scanned_all*/,
						 vacstmt->analyze, vacrelstats->rel_tuples);

	if (gp_indexcheck_vacuum == INDEX_CHECK_ALL ||
		(gp_indexcheck_vacuum == INDEX_CHECK_SYSTEM &&
		 PG_CATALOG_NAMESPACE == RelationGetNamespace(onerel)))
	{
		int			i;

		for (i = 0; i < nindexes; i++)
		{
			if (Irel[i]->rd_rel->relam == BTREE_AM_OID)
				_bt_validate_vacuum(Irel[i], onerel, OldestXmin);
		}
	}

	/* and log the action if appropriate */
	if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0)
	{
		if (Log_autovacuum_min_duration == 0 ||
			TimestampDifferenceExceeds(starttime, GetCurrentTimestamp(),
									   Log_autovacuum_min_duration))
			ereport(LOG,
					(errmsg("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n"
							"pages: %d removed, %d remain\n"
							"tuples: %.0f removed, %.0f remain\n"
							"system usage: %s",
							get_database_name(MyDatabaseId),
							get_namespace_name(RelationGetNamespace(onerel)),
							RelationGetRelationName(onerel),
							vacrelstats->num_index_scans,
						  vacrelstats->pages_removed, vacrelstats->rel_pages,
						vacrelstats->tuples_deleted, vacrelstats->rel_tuples,
							pg_rusage_show(&ru0))));
	}

	return heldoff;
}
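The "heldoff" return contract documented above is easiest to see from the caller's
side. The following is a minimal, illustrative sketch (the wrapper name and the
exact close/commit sequence are assumptions, not taken from this listing): the
caller commits first and only then re-enables interrupts.

	/* Sketch only: honoring the "heldoff" return value of lazy_vacuum_rel(). */
	static void
	vacuum_heap_rel(Relation onerel, VacuumStmt *vacstmt,
					BufferAccessStrategy bstrategy, List *updated_stats)
	{
		bool		heldoff;

		heldoff = lazy_vacuum_rel(onerel, vacstmt, bstrategy, updated_stats);

		/* ... close the relation and commit the transaction (elided) ... */

		/*
		 * lazy_vacuum_rel() held off interrupts if it truncated the heap;
		 * only after the commit is it safe to allow them again.
		 */
		if (heldoff)
			RESUME_INTERRUPTS();
	}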
Example #18
0
/*
 * master_copy_shard_placement implements a user-facing UDF to copy data from
 * a healthy (source) node to an inactive (target) node. To accomplish this it
 * entirely recreates the table structure before copying all data. During this
 * time all modifications to the shard are paused. After successful repair, the
 * inactive placement is marked healthy and modifications may continue. If the
 * repair fails at any point, this function throws an error, leaving the node
 * in an unhealthy state.
 */
Datum
master_copy_shard_placement(PG_FUNCTION_ARGS)
{
    int64 shardId = PG_GETARG_INT64(0);
    text *sourceNodeName = PG_GETARG_TEXT_P(1);
    int32 sourceNodePort = PG_GETARG_INT32(2);
    text *targetNodeName = PG_GETARG_TEXT_P(3);
    int32 targetNodePort = PG_GETARG_INT32(4);
    ShardInterval *shardInterval = LoadShardInterval(shardId);
    Oid distributedTableId = shardInterval->relationId;

    List *shardPlacementList = NIL;
    ShardPlacement *sourcePlacement = NULL;
    ShardPlacement *targetPlacement = NULL;
    List *ddlCommandList = NIL;
    bool recreated = false;
    bool dataCopied = false;

    /*
     * By taking an exclusive lock on the shard, we both stop all modifications
     * (INSERT, UPDATE, or DELETE) and prevent concurrent repair operations from
     * being able to operate on this shard.
     */
    LockShard(shardId, ExclusiveLock);

    shardPlacementList = LoadShardPlacementList(shardId);

    sourcePlacement = SearchShardPlacementInList(shardPlacementList, sourceNodeName,
                      sourceNodePort);
    if (sourcePlacement->shardState != STATE_FINALIZED)
    {
        ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                        errmsg("source placement must be in finalized state")));
    }

    targetPlacement = SearchShardPlacementInList(shardPlacementList, targetNodeName,
                      targetNodePort);
    if (targetPlacement->shardState != STATE_INACTIVE)
    {
        ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                        errmsg("target placement must be in inactive state")));
    }

    /* retrieve the DDL commands for the table and run them */
    ddlCommandList = RecreateTableDDLCommandList(distributedTableId, shardId);

    recreated = ExecuteRemoteCommandList(targetPlacement->nodeName,
                                         targetPlacement->nodePort,
                                         ddlCommandList);
    if (!recreated)
    {
        ereport(ERROR, (errmsg("could not recreate shard table"),
                        errhint("Consult recent messages in the server logs for "
                                "details.")));
    }

    HOLD_INTERRUPTS();

    dataCopied = CopyDataFromFinalizedPlacement(distributedTableId, shardId,
                 sourcePlacement, targetPlacement);
    if (!dataCopied)
    {
        ereport(ERROR, (errmsg("could not copy shard data"),
                        errhint("Consult recent messages in the server logs for "
                                "details.")));
    }

    /* the placement is repaired, so return to finalized state */
    DeleteShardPlacementRow(targetPlacement->id);
    InsertShardPlacementRow(targetPlacement->id, targetPlacement->shardId,
                            STATE_FINALIZED, targetPlacement->nodeName,
                            targetPlacement->nodePort);

    RESUME_INTERRUPTS();

    PG_RETURN_VOID();
}
Example #19
0
/**
 * This method is called after fork of the sweeper process. It sets up signal
 * handlers and does initialization that is required by a postgres backend.
 */
NON_EXEC_STATIC void
BackoffSweeperMain(int argc, char *argv[])
{
	sigjmp_buf	local_sigjmp_buf;

	IsUnderPostmaster = true;
	isSweeperProcess = true;

	/* Stay away from PMChildSlot */
	MyPMChildSlot = -1;

	/* reset MyProcPid */
	MyProcPid = getpid();

	/* Lose the postmaster's on-exit routines */
	on_exit_reset();

	/* Identify myself via ps */
	init_ps_display("sweeper process", "", "", "");

	SetProcessingMode(InitProcessing);

	/*
	 * Set up signal handlers.  We operate on databases much like a regular
	 * backend, so we use the same signal handling.  See equivalent code in
	 * tcop/postgres.c.
	 */
	pqsignal(SIGHUP, SIG_IGN);
	pqsignal(SIGINT, SIG_IGN);
	pqsignal(SIGALRM, SIG_IGN);
	pqsignal(SIGPIPE, SIG_IGN);
	pqsignal(SIGUSR1, SIG_IGN);

	pqsignal(SIGTERM, die);
	pqsignal(SIGQUIT, quickdie);
	pqsignal(SIGUSR2, BackoffRequestShutdown);

	pqsignal(SIGFPE, FloatExceptionHandler);
	pqsignal(SIGCHLD, SIG_DFL);

	/*
	 * Copied from bgwriter
	 */
	CurrentResourceOwner = ResourceOwnerCreate(NULL, "Sweeper process");

	/* Early initialization */
	BaseInit();

	/* See InitPostgres()... */
	InitProcess();

	SetProcessingMode(NormalProcessing);

	/*
	 * If an exception is encountered, processing resumes here.
	 *
	 * See notes in postgres.c about the design of this coding.
	 */
	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
	{
		/* Prevents interrupts while cleaning up */
		HOLD_INTERRUPTS();

		/* Report the error to the server log */
		EmitErrorReport();

		/*
		 * We can now go away.  Note that because we'll call InitProcess, a
		 * callback will be registered to do ProcKill, which will clean up
		 * necessary state.
		 */
		proc_exit(0);
	}

	/* We can now handle ereport(ERROR) */
	PG_exception_stack = &local_sigjmp_buf;

	PG_SETMASK(&UnBlockSig);

	MyBackendId = InvalidBackendId;

	/* main loop */
	BackoffSweeperLoop();

	/* One iteration done, go away */
	proc_exit(0);
}
Example #20
0
/*
 * Main entry point for bgwriter process
 *
 * This is invoked from AuxiliaryProcessMain, which has already created the
 * basic execution environment, but not enabled signals yet.
 */
void
BackgroundWriterMain(void)
{
	sigjmp_buf	local_sigjmp_buf;
	MemoryContext bgwriter_context;
	bool		prev_hibernate;

	/*
	 * Properly accept or ignore signals the postmaster might send us.
	 *
	 * bgwriter doesn't participate in ProcSignal signalling, but a SIGUSR1
	 * handler is still needed for latch wakeups.
	 */
	pqsignal(SIGHUP, BgSigHupHandler);	/* set flag to read config file */
	pqsignal(SIGINT, SIG_IGN);
	pqsignal(SIGTERM, ReqShutdownHandler);		/* shutdown */
	pqsignal(SIGQUIT, bg_quickdie);		/* hard crash time */
	pqsignal(SIGALRM, SIG_IGN);
	pqsignal(SIGPIPE, SIG_IGN);
	pqsignal(SIGUSR1, bgwriter_sigusr1_handler);
	pqsignal(SIGUSR2, SIG_IGN);

	/*
	 * Reset some signals that are accepted by postmaster but not here
	 */
	pqsignal(SIGCHLD, SIG_DFL);
	pqsignal(SIGTTIN, SIG_DFL);
	pqsignal(SIGTTOU, SIG_DFL);
	pqsignal(SIGCONT, SIG_DFL);
	pqsignal(SIGWINCH, SIG_DFL);

	/* We allow SIGQUIT (quickdie) at all times */
	sigdelset(&BlockSig, SIGQUIT);

	/*
	 * Create a resource owner to keep track of our resources (currently only
	 * buffer pins).
	 */
	CurrentResourceOwner = ResourceOwnerCreate(NULL, "Background Writer");

	/*
	 * We just started, assume there has been either a shutdown or
	 * end-of-recovery snapshot.
	 */
	last_snapshot_ts = GetCurrentTimestamp();

	/*
	 * Create a memory context that we will do all our work in.  We do this so
	 * that we can reset the context during error recovery and thereby avoid
	 * possible memory leaks.  Formerly this code just ran in
	 * TopMemoryContext, but resetting that would be a really bad idea.
	 */
	bgwriter_context = AllocSetContextCreate(TopMemoryContext,
											 "Background Writer",
											 ALLOCSET_DEFAULT_MINSIZE,
											 ALLOCSET_DEFAULT_INITSIZE,
											 ALLOCSET_DEFAULT_MAXSIZE);
	MemoryContextSwitchTo(bgwriter_context);

	/*
	 * If an exception is encountered, processing resumes here.
	 *
	 * See notes in postgres.c about the design of this coding.
	 */
	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
	{
		/* Since not using PG_TRY, must reset error stack by hand */
		error_context_stack = NULL;

		/* Prevent interrupts while cleaning up */
		HOLD_INTERRUPTS();

		/* Report the error to the server log */
		EmitErrorReport();

		/*
		 * These operations are really just a minimal subset of
		 * AbortTransaction().  We don't have very many resources to worry
		 * about in bgwriter, but we do have LWLocks, buffers, and temp files.
		 */
		LWLockReleaseAll();
		AbortBufferIO();
		UnlockBuffers();
		/* buffer pins are released here: */
		ResourceOwnerRelease(CurrentResourceOwner,
							 RESOURCE_RELEASE_BEFORE_LOCKS,
							 false, true);
		/* we needn't bother with the other ResourceOwnerRelease phases */
		AtEOXact_Buffers(false);
		AtEOXact_SMgr();
		AtEOXact_Files();
		AtEOXact_HashTables(false);

		/*
		 * Now return to normal top-level context and clear ErrorContext for
		 * next time.
		 */
		MemoryContextSwitchTo(bgwriter_context);
		FlushErrorState();

		/* Flush any leaked data in the top-level context */
		MemoryContextResetAndDeleteChildren(bgwriter_context);

		/* Now we can allow interrupts again */
		RESUME_INTERRUPTS();

		/*
		 * Sleep at least 1 second after any error.  A write error is likely
		 * to be repeated, and we don't want to be filling the error logs as
		 * fast as we can.
		 */
		pg_usleep(1000000L);

		/*
		 * Close all open files after any error.  This is helpful on Windows,
		 * where holding deleted files open causes various strange errors.
		 * It's not clear we need it elsewhere, but shouldn't hurt.
		 */
		smgrcloseall();

		/* Report wait end here, when there is no further possibility of wait */
		pgstat_report_wait_end();
	}

	/* We can now handle ereport(ERROR) */
	PG_exception_stack = &local_sigjmp_buf;

	/*
	 * Unblock signals (they were blocked when the postmaster forked us)
	 */
	PG_SETMASK(&UnBlockSig);

	/*
	 * Reset hibernation state after any error.
	 */
	prev_hibernate = false;

	/*
	 * Loop forever
	 */
	for (;;)
	{
		bool		can_hibernate;
		int			rc;

		/* Clear any already-pending wakeups */
		ResetLatch(MyLatch);

		if (got_SIGHUP)
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP);
		}
		if (shutdown_requested)
		{
			/*
			 * From here on, elog(ERROR) should end with exit(1), not send
			 * control back to the sigsetjmp block above
			 */
			ExitOnAnyError = true;
			/* Normal exit from the bgwriter is here */
			proc_exit(0);		/* done */
		}

		/*
		 * Do one cycle of dirty-buffer writing.
		 */
		can_hibernate = BgBufferSync();

		/*
		 * Send off activity statistics to the stats collector
		 */
		pgstat_send_bgwriter();

		if (FirstCallSinceLastCheckpoint())
		{
			/*
			 * After any checkpoint, close all smgr files.  This is so we
			 * won't hang onto smgr references to deleted files indefinitely.
			 */
			smgrcloseall();
		}

		/*
		 * Log a new xl_running_xacts every now and then so replication can
		 * get into a consistent state faster (think of suboverflowed
		 * snapshots) and clean up resources (locks, KnownXids*) more
		 * frequently. The costs of this are relatively low, so doing it 4
		 * times (LOG_SNAPSHOT_INTERVAL_MS) a minute seems fine.
		 *
		 * We assume the interval for writing xl_running_xacts is
		 * significantly bigger than BgWriterDelay, so we don't complicate the
		 * overall timeout handling but just assume we're going to get called
		 * often enough even if hibernation mode is active. It's not that
		 * important that log_snap_interval_ms is met strictly. To make sure
		 * we're not waking the disk up unnecessarily on an idle system we
		 * check whether there has been any WAL inserted since the last time
		 * we've logged a running xacts.
		 *
		 * We do this logging in the bgwriter as its the only process that is
		 * run regularly and returns to its mainloop all the time. E.g.
		 * Checkpointer, when active, is barely ever in its mainloop and thus
		 * makes it hard to log regularly.
		 */
		if (XLogStandbyInfoActive() && !RecoveryInProgress())
		{
			TimestampTz timeout = 0;
			TimestampTz now = GetCurrentTimestamp();

			timeout = TimestampTzPlusMilliseconds(last_snapshot_ts,
												  LOG_SNAPSHOT_INTERVAL_MS);

			/*
			 * only log if enough time has passed and some xlog record has
			 * been inserted.
			 */
			if (now >= timeout &&
				last_snapshot_lsn != GetXLogInsertRecPtr())
			{
				last_snapshot_lsn = LogStandbySnapshot();
				last_snapshot_ts = now;
			}
		}

		/*
		 * Sleep until we are signaled or BgWriterDelay has elapsed.
		 *
		 * Note: the feedback control loop in BgBufferSync() expects that we
		 * will call it every BgWriterDelay msec.  While it's not critical for
		 * correctness that that be exact, the feedback loop might misbehave
		 * if we stray too far from that.  Hence, avoid loading this process
		 * down with latch events that are likely to happen frequently during
		 * normal operation.
		 */
		rc = WaitLatch(MyLatch,
					   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
					   BgWriterDelay /* ms */ );

		/*
		 * If no latch event and BgBufferSync says nothing's happening, extend
		 * the sleep in "hibernation" mode, where we sleep for much longer
		 * than bgwriter_delay says.  Fewer wakeups save electricity.  When a
		 * backend starts using buffers again, it will wake us up by setting
		 * our latch.  Because the extra sleep will persist only as long as no
		 * buffer allocations happen, this should not distort the behavior of
		 * BgBufferSync's control loop too badly; essentially, it will think
		 * that the system-wide idle interval didn't exist.
		 *
		 * There is a race condition here, in that a backend might allocate a
		 * buffer between the time BgBufferSync saw the alloc count as zero
		 * and the time we call StrategyNotifyBgWriter.  While it's not
		 * critical that we not hibernate anyway, we try to reduce the odds of
		 * that by only hibernating when BgBufferSync says nothing's happening
		 * for two consecutive cycles.  Also, we mitigate any possible
		 * consequences of a missed wakeup by not hibernating forever.
		 */
		if (rc == WL_TIMEOUT && can_hibernate && prev_hibernate)
		{
			/* Ask for notification at next buffer allocation */
			StrategyNotifyBgWriter(MyProc->pgprocno);
			/* Sleep ... */
			rc = WaitLatch(MyLatch,
						   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
						   BgWriterDelay * HIBERNATE_FACTOR);
			/* Reset the notification request in case we timed out */
			StrategyNotifyBgWriter(-1);
		}

		/*
		 * Emergency bailout if postmaster has died.  This is to avoid the
		 * necessity for manual cleanup of all postmaster children.
		 */
		if (rc & WL_POSTMASTER_DEATH)
			exit(1);

		prev_hibernate = can_hibernate;
	}
}
Example #21
0
/*
 * LWLockAcquire - acquire a lightweight lock in the specified mode
 *
 * If the lock is not available, sleep until it is.
 *
 * Side effect: cancel/die interrupts are held off until lock release.
 */
void
LWLockAcquire(LWLockId lockid, LWLockMode mode)
{
	volatile LWLock *lock = &(LWLockArray[lockid].lock);
	PGPROC	   *proc = MyProc;
	bool		retry = false;
	int			extraWaits = 0;

	PRINT_LWDEBUG("LWLockAcquire", lockid, lock);

#ifdef LWLOCK_STATS
	/* Set up local count state first time through in a given process */
	if (counts_for_pid != MyProcPid)
	{
		int		   *LWLockCounter = (int *) ((char *) LWLockArray - 2 * sizeof(int));
		int			numLocks = LWLockCounter[1];

		sh_acquire_counts = calloc(numLocks, sizeof(int));
		ex_acquire_counts = calloc(numLocks, sizeof(int));
		block_counts = calloc(numLocks, sizeof(int));
		counts_for_pid = MyProcPid;
		on_shmem_exit(print_lwlock_stats, 0);
	}
	/* Count lock acquisition attempts */
	if (mode == LW_EXCLUSIVE)
		ex_acquire_counts[lockid]++;
	else
		sh_acquire_counts[lockid]++;
#endif   /* LWLOCK_STATS */

	/*
	 * We can't wait if we haven't got a PGPROC.  This should only occur
	 * during bootstrap or shared memory initialization.  Put an Assert here
	 * to catch unsafe coding practices.
	 */
	Assert(!(proc == NULL && IsUnderPostmaster));

	/* Ensure we will have room to remember the lock */
	if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS)
		elog(ERROR, "too many LWLocks taken");

	/*
	 * Lock out cancel/die interrupts until we exit the code section protected
	 * by the LWLock.  This ensures that interrupts will not interfere with
	 * manipulations of data structures in shared memory.
	 */
	HOLD_INTERRUPTS();

	/*
	 * Loop here to try to acquire lock after each time we are signaled by
	 * LWLockRelease.
	 *
	 * NOTE: it might seem better to have LWLockRelease actually grant us the
	 * lock, rather than retrying and possibly having to go back to sleep. But
	 * in practice that is no good because it means a process swap for every
	 * lock acquisition when two or more processes are contending for the same
	 * lock.  Since LWLocks are normally used to protect not-very-long
	 * sections of computation, a process needs to be able to acquire and
	 * release the same lock many times during a single CPU time slice, even
	 * in the presence of contention.  The efficiency of being able to do that
	 * outweighs the inefficiency of sometimes wasting a process dispatch
	 * cycle because the lock is not free when a released waiter finally gets
	 * to run.	See pgsql-hackers archives for 29-Dec-01.
	 */
	for (;;)
	{
		bool		mustwait;

		/* Acquire mutex.  Time spent holding mutex should be short! */
		SpinLockAcquire(&lock->mutex);

		/* If retrying, allow LWLockRelease to release waiters again */
		if (retry)
			lock->releaseOK = true;

		/* If I can get the lock, do so quickly. */
		if (mode == LW_EXCLUSIVE)
		{
			if (lock->exclusive == 0 && lock->shared == 0)
			{
				lock->exclusive++;
				mustwait = false;
			}
			else
				mustwait = true;
		}
		else
		{
			if (lock->exclusive == 0)
			{
				lock->shared++;
				mustwait = false;
			}
			else
				mustwait = true;
		}

		if (!mustwait)
			break;				/* got the lock */

		/*
		 * Add myself to wait queue.
		 *
		 * If we don't have a PGPROC structure, there's no way to wait. This
		 * should never occur, since MyProc should only be null during shared
		 * memory initialization.
		 */
		if (proc == NULL)
			elog(PANIC, "cannot wait without a PGPROC structure");

		proc->lwWaiting = true;
		proc->lwExclusive = (mode == LW_EXCLUSIVE);
		proc->lwWaitLink = NULL;
		if (lock->head == NULL)
			lock->head = proc;
		else
			lock->tail->lwWaitLink = proc;
		lock->tail = proc;

		/* Can release the mutex now */
		SpinLockRelease(&lock->mutex);

		/*
		 * Wait until awakened.
		 *
		 * Since we share the process wait semaphore with the regular lock
		 * manager and ProcWaitForSignal, and we may need to acquire an LWLock
		 * while one of those is pending, it is possible that we get awakened
		 * for a reason other than being signaled by LWLockRelease. If so,
		 * loop back and wait again.  Once we've gotten the LWLock,
		 * re-increment the sema by the number of additional signals received,
		 * so that the lock manager or signal manager will see the received
		 * signal when it next waits.
		 */
		LOG_LWDEBUG("LWLockAcquire", lockid, "waiting");

#ifdef LWLOCK_STATS
		block_counts[lockid]++;
#endif

		TRACE_POSTGRESQL_LWLOCK_WAIT_START(lockid, mode);

		for (;;)
		{
			/* "false" means cannot accept cancel/die interrupt here. */
			PGSemaphoreLock(&proc->sem, false);
			if (!proc->lwWaiting)
				break;
			extraWaits++;
		}

		TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(lockid, mode);

		LOG_LWDEBUG("LWLockAcquire", lockid, "awakened");

		/* Now loop back and try to acquire lock again. */
		retry = true;
	}

	/* We are done updating shared state of the lock itself. */
	SpinLockRelease(&lock->mutex);

	TRACE_POSTGRESQL_LWLOCK_ACQUIRE(lockid, mode);

	/* Add lock to list of locks held by this backend */
	held_lwlocks[num_held_lwlocks++] = lockid;

	/*
	 * Fix the process wait semaphore's count for any absorbed wakeups.
	 */
	while (extraWaits-- > 0)
		PGSemaphoreUnlock(&proc->sem);
}
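For reference, the acquire/release bracket that this function supports looks like
the sketch below (the particular lock id is illustrative). The interrupt holdoff
taken above is dropped again inside LWLockRelease().

	/* Sketch: a typical critical section protected by an LWLock. */
	LWLockAcquire(ShmemIndexLock, LW_EXCLUSIVE);

	/* ... read or modify the shared-memory structures the lock protects ... */

	LWLockRelease(ShmemIndexLock);	/* also re-enables cancel/die interrupts */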
Example #22
0
void FileRepResetPeer_Main(void)
{
	/* BASIC PROCESS SETUP */

	FileRepReset_ConfigureSignals();

	/*
	 * If an exception is encountered, processing resumes here.
	 *
	 * See notes in postgres.c about the design of this coding and comments about how the error
	 * handling works.
	 */
	sigjmp_buf		local_sigjmp_buf;
	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
	{
		HOLD_INTERRUPTS();
		EmitErrorReport();
		proc_exit(EXIT_CODE_SHOULD_ENTER_FAULT);
	}
	/* We can now handle ereport(ERROR) */
	PG_exception_stack = &local_sigjmp_buf;
	PG_SETMASK(&UnBlockSig);


	/** NOW DO THE ACTUAL WORK */ 
	char messageFromPeer[MESSAGE_FROM_PEER_BUF_SIZE];
	char resetNumberFromPeer[MESSAGE_FROM_PEER_BUF_SIZE];
	char resetNumberThatIndicatesResetComplete[MESSAGE_FROM_PEER_BUF_SIZE];
	struct addrinfo *addrList = NULL;
	char portStr[100];

	PrimaryMirrorModeTransitionArguments args = primaryMirrorGetArgumentsFromLocalMemory();
	Assert(args.mode == PMModePrimarySegment || args.mode == PMModeMirrorSegment);

	snprintf(portStr, sizeof(portStr), "%d", args.peerPostmasterPort);
	if (! determineTargetHost(&addrList, args.peerAddress, portStr))
	{
		elog(WARNING, "during reset, unable to look up address for peer host to coordinate reset; "
				"will transition to fault state.");
		proc_exit(EXIT_CODE_SHOULD_ENTER_FAULT);
	}

	sendMessageToPeerAndExitIfProblem(addrList, "beginPostmasterReset", messageFromPeer,
		resetNumberThatIndicatesResetComplete);

	for ( ;; )
	{
		pg_usleep(10 * 1000L); /* 10 ms */
		sendMessageToPeerAndExitIfProblem(addrList, "getPostmasterResetStatus", messageFromPeer, resetNumberFromPeer );
		if (strequals(messageFromPeer, RESET_STATUS_IS_IN_RESET_PIVOT_POINT))
		{
			if (args.mode == PMModeMirrorSegment)
			{
				/**
				 * peer is in the reset pivot point, we can break out of our checking loop and
				 *   thus exit with a code telling the postmaster to begin the startup sequence again
				 *
				 * this is only done on the mirror as currently the mirror must execute the startup sequence
				 *   before the primary
				 */
				elog(DEBUG1, "peer reset: primary peer has reached reset point");
				break;
			}
		}
		else if (strequals(messageFromPeer, RESET_STATUS_IS_RUNNING))
		{
		/** it's running -- is it >= the reset number that indicates the reset is complete? */
			if (strcmp( resetNumberFromPeer, resetNumberThatIndicatesResetComplete) >= 0)
			{
				/** yes, the reset is complete and so we can quit and do a restart */
				elog(DEBUG1, "peer reset: mirror peer reset is complete");
				break;
			}
		}
	}

	proc_exit(EXIT_CODE_SHOULD_RESTART_SHMEM_CLEANLY);
}
Example #23
0
/*
 * LWLockConditionalAcquire - acquire a lightweight lock in the specified mode
 *
 * If the lock is not available, return FALSE with no side-effects.
 *
 * If successful, cancel/die interrupts are held off until lock release.
 */
bool
LWLockConditionalAcquire(LWLockId lockid, LWLockMode mode)
{
	volatile LWLock *lock = &(LWLockArray[lockid].lock);
	bool		mustwait;

	PRINT_LWDEBUG("LWLockConditionalAcquire", lockid, lock);

	/* Ensure we will have room to remember the lock */
	if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS)
		elog(ERROR, "too many LWLocks taken");

	/*
	 * Lock out cancel/die interrupts until we exit the code section protected
	 * by the LWLock.  This ensures that interrupts will not interfere with
	 * manipulations of data structures in shared memory.
	 */
	HOLD_INTERRUPTS();

	/* Acquire mutex.  Time spent holding mutex should be short! */
	SpinLockAcquire(&lock->mutex);

	/* If I can get the lock, do so quickly. */
	if (mode == LW_EXCLUSIVE)
	{
		if (lock->exclusive == 0 && lock->shared == 0)
		{
			lock->exclusive++;
			mustwait = false;
		}
		else
			mustwait = true;
	}
	else
	{
		if (lock->exclusive == 0)
		{
			lock->shared++;
			mustwait = false;
		}
		else
			mustwait = true;
	}

	/* We are done updating shared state of the lock itself. */
	SpinLockRelease(&lock->mutex);

	if (mustwait)
	{
		/* Failed to get lock, so release interrupt holdoff */
		RESUME_INTERRUPTS();
		LOG_LWDEBUG("LWLockConditionalAcquire", lockid, "failed");
		TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL(lockid, mode);
	}
	else
	{
		/* Add lock to list of locks held by this backend */
		held_lwlocks[num_held_lwlocks++] = lockid;
		TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE(lockid, mode);
	}

	return !mustwait;
}
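A hedged usage sketch of the conditional variant follows (the lock name is
illustrative): the caller must be prepared for a false return and must not assume
the lock is held on that path.

	/* Sketch: opportunistic locking with a fallback path. */
	if (LWLockConditionalAcquire(ProcArrayLock, LW_SHARED))
	{
		/* ... inspect the shared state the lock protects ... */
		LWLockRelease(ProcArrayLock);
	}
	else
	{
		/* Lock was busy; skip the optional work instead of sleeping. */
	}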
Example #24
0
/*
 * LWLockAcquireOrWait - Acquire lock, or wait until it's free
 *
 * The semantics of this function are a bit funky.  If the lock is currently
 * free, it is acquired in the given mode, and the function returns true.  If
 * the lock isn't immediately free, the function waits until it is released
 * and returns false, but does not acquire the lock.
 *
 * This is currently used for WALWriteLock: when a backend flushes the WAL,
 * holding WALWriteLock, it can flush the commit records of many other
 * backends as a side-effect.  Those other backends need to wait until the
 * flush finishes, but don't need to acquire the lock anymore.  They can just
 * wake up, observe that their records have already been flushed, and return.
 */
bool
LWLockAcquireOrWait(LWLock *lock, LWLockMode mode)
{
	PGPROC	   *proc = MyProc;
	bool		mustwait;
	int			extraWaits = 0;
#ifdef LWLOCK_STATS
	lwlock_stats *lwstats;
#endif

	PRINT_LWDEBUG("LWLockAcquireOrWait", lock);

#ifdef LWLOCK_STATS
	lwstats = get_lwlock_stats_entry(lock);
#endif

	/* Ensure we will have room to remember the lock */
	if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS)
		elog(ERROR, "too many LWLocks taken");

	/*
	 * Lock out cancel/die interrupts until we exit the code section protected
	 * by the LWLock.  This ensures that interrupts will not interfere with
	 * manipulations of data structures in shared memory.
	 */
	HOLD_INTERRUPTS();

	/* Acquire mutex.  Time spent holding mutex should be short! */
	SpinLockAcquire(&lock->mutex);

	/* If I can get the lock, do so quickly. */
	if (mode == LW_EXCLUSIVE)
	{
		if (lock->exclusive == 0 && lock->shared == 0)
		{
			lock->exclusive++;
			mustwait = false;
		}
		else
			mustwait = true;
	}
	else
	{
		if (lock->exclusive == 0)
		{
			lock->shared++;
			mustwait = false;
		}
		else
			mustwait = true;
	}

	if (mustwait)
	{
		/*
		 * Add myself to wait queue.
		 *
		 * If we don't have a PGPROC structure, there's no way to wait.  This
		 * should never occur, since MyProc should only be null during shared
		 * memory initialization.
		 */
		if (proc == NULL)
			elog(PANIC, "cannot wait without a PGPROC structure");

		proc->lwWaiting = true;
		proc->lwWaitMode = LW_WAIT_UNTIL_FREE;
		proc->lwWaitLink = NULL;
		if (lock->head == NULL)
			lock->head = proc;
		else
			lock->tail->lwWaitLink = proc;
		lock->tail = proc;

		/* Can release the mutex now */
		SpinLockRelease(&lock->mutex);

		/*
		 * Wait until awakened.  Like in LWLockAcquire, be prepared for bogus
		 * wakeups, because we share the semaphore with ProcWaitForSignal.
		 */
		LOG_LWDEBUG("LWLockAcquireOrWait", T_NAME(lock), T_ID(lock),
					"waiting");

#ifdef LWLOCK_STATS
		lwstats->block_count++;
#endif

		TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), T_ID(lock), mode);

		for (;;)
		{
			/* "false" means cannot accept cancel/die interrupt here. */
			PGSemaphoreLock(&proc->sem, false);
			if (!proc->lwWaiting)
				break;
			extraWaits++;
		}

		TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), T_ID(lock), mode);

		LOG_LWDEBUG("LWLockAcquireOrWait", T_NAME(lock), T_ID(lock),
					"awakened");
	}
	else
	{
		/* We are done updating shared state of the lock itself. */
		SpinLockRelease(&lock->mutex);
	}

	/*
	 * Fix the process wait semaphore's count for any absorbed wakeups.
	 */
	while (extraWaits-- > 0)
		PGSemaphoreUnlock(&proc->sem);

	if (mustwait)
	{
		/* Failed to get lock, so release interrupt holdoff */
		RESUME_INTERRUPTS();
		LOG_LWDEBUG("LWLockAcquireOrWait", T_NAME(lock), T_ID(lock), "failed");
		TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_FAIL(T_NAME(lock), T_ID(lock),
													 mode);
	}
	else
	{
		/* Add lock to list of locks held by this backend */
		held_lwlocks[num_held_lwlocks++] = lock;
		TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT(T_NAME(lock), T_ID(lock),
												mode);
	}

	return !mustwait;
}
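A sketch of the WALWriteLock pattern described in the header comment, simplified
and with the actual flush elided (this is an illustration of the intended use, not
code from this listing):

	if (LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
	{
		/* Got the lock: perform the flush ourselves, then release. */
		/* ... write and fsync WAL up to the requested position ... */
		LWLockRelease(WALWriteLock);
	}
	else
	{
		/*
		 * The previous holder finished its flush while we waited.  Our
		 * commit record may already be on disk; re-check the flushed
		 * position and only loop back if it is still behind our request.
		 */
	}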
Example #25
0
/*
 * LWLockAcquire - acquire a lightweight lock in the specified mode
 *
 * If the lock is not available, sleep until it is.
 *
 * Side effect: cancel/die interrupts are held off until lock release.
 */
void
LWLockAcquire(LWLockId lockid, LWLockMode mode)
{
	volatile LWLock *lock = &(LWLockArray[lockid].lock);
#if LWLOCK_LOCK_PARTS > 1
	volatile LWLockPart *part = LWLOCK_PART(lock, lockid, MyBackendId);
#endif
	PGPROC	   *proc = MyProc;
	bool		retry = false;
	int			extraWaits = 0;

	PRINT_LWDEBUG("LWLockAcquire", lockid, lock);

#ifdef LWLOCK_STATS
	/* Set up local count state first time through in a given process */
	if (counts_for_pid != MyProcPid)
	{
		int		   *LWLockCounter = (int *) ((char *) LWLockArray - 2 * sizeof(int));
		int			numLocks = LWLockCounter[1];

		sh_acquire_counts = calloc(numLocks, sizeof(int));
		ex_acquire_counts = calloc(numLocks, sizeof(int));
		block_counts = calloc(numLocks, sizeof(int));
		counts_for_pid = MyProcPid;
		on_shmem_exit(print_lwlock_stats, 0);
	}
	/* Count lock acquisition attempts */
	if (mode == LW_EXCLUSIVE)
		ex_acquire_counts[lockid]++;
	else
		sh_acquire_counts[lockid]++;
#endif   /* LWLOCK_STATS */

	/*
	 * We can't wait if we haven't got a PGPROC.  This should only occur
	 * during bootstrap or shared memory initialization.  Put an Assert here
	 * to catch unsafe coding practices.
	 */
	Assert(!(proc == NULL && IsUnderPostmaster));

	/* Ensure we will have room to remember the lock */
	if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS)
		elog(ERROR, "too many LWLocks taken");
		
	/*
	 * Lock out cancel/die interrupts until we exit the code section protected
	 * by the LWLock.  This ensures that interrupts will not interfere with
	 * manipulations of data structures in shared memory.
	 */
	HOLD_INTERRUPTS();

	/*
	 * Loop here to try to acquire lock after each time we are signaled by
	 * LWLockRelease.
	 *
	 * NOTE: it might seem better to have LWLockRelease actually grant us the
	 * lock, rather than retrying and possibly having to go back to sleep. But
	 * in practice that is no good because it means a process swap for every
	 * lock acquisition when two or more processes are contending for the same
	 * lock.  Since LWLocks are normally used to protect not-very-long
	 * sections of computation, a process needs to be able to acquire and
	 * release the same lock many times during a single CPU time slice, even
	 * in the presence of contention.  The efficiency of being able to do that
	 * outweighs the inefficiency of sometimes wasting a process dispatch
	 * cycle because the lock is not free when a released waiter finally gets
	 * to run.	See pgsql-hackers archives for 29-Dec-01.
	 */
	for (;;)
	{
		bool		mustwait;
		
		if (mode == LW_SHARED)
		{
#ifdef		LWLOCK_PART_SHARED_OPS_ATOMIC
			/* Increment shared counter partition. If there's no contention,
			 * this is sufficient to take the lock
			 */
			LWLOCK_PART_SHARED_POSTINC_ATOMIC(lock, lockid, part, MyBackendId);
			LWLOCK_PART_SHARED_FENCE();
			
			/* A concurrent exclusive locking attempt does the following
			 * three steps
			 *   1) Acquire mutex
			 *   2) Check shared counter partitions for readers.
			 *   3a) If found add proc to wait queue, block, restart at (1)
			 *   3b) If not found, set exclusive flag, continue with (4)
			 *   4) Enter protected section
			 * The fence after the atomic add above ensures that no further
			 * such attempt can proceed to (3b) or beyond. There may be
			 * pre-existing exclusive locking attempts at step (3b) or beyond,
			 * but we can recognize those by either the mutex being taken, or
			 * the exclusive flag being set. Conversely, if we see neither, we
			 * may proceed and enter the protected section.
			 *
			 * FIXME: This doesn't work if slock_t is a struct or doesn't
			 * use 0 for state "unlocked".
			 */

			if ((lock->mutex == 0) && (lock->exclusive == 0)) {
				/* If retrying, allow LWLockRelease to release waiters again.
				 * Usually this happens after we acquired the mutex, but if
				 * we skip that, we still need to set releaseOK.
				 *
				 * Acquiring the mutex here is not really an option - if many
				 * readers are awoken simultaneously by an exclusive unlock,
				 * that would be a source of considerable contention.
				 *
				 * Fortunately, this is safe even without the mutex. First,
				 * there actually cannot be any non-fast path unlocking
				 * attempt in progress, because we'd then either still see
				 * the exclusive flag set or the mutex being taken. And
				 * even if there was, and such an attempt cleared the flag
				 * immediately after we set it, it'd also wake up some waiter
				 * who'd then re-set the flag.
				 *
				 * The only reason to do this here, and not directly
				 * after returning from PGSemaphoreLock(), is that it seems
				 * beneficial to make SpinLockAcquire() the first thing to
				 * touch the lock if possible, in case we acquire the spin
				 * lock at all. That way, the cache line doesn't go through
				 * a possible shared state, but instead directly to exclusive.
				 * On Opterons at least, there seems to be a difference, c.f.
				 * the comment above tas() for x86_64 in s_lock.h
				 */
				if (retry && !lock->releaseOK)
					lock->releaseOK = true;
					
				goto lock_acquired;
			}
				
			/* At this point, we don't know if the concurrent exclusive locker
			 * has proceeded to (3b) or blocked. We must take the mutex and
			 * re-check
			 */
#endif /* LWLOCK_PART_SHARED_OPS_ATOMIC */
			
			/* Acquire mutex.  Time spent holding mutex should be short! */
			SpinLockAcquire(&lock->mutex);
			
			if (lock->exclusive == 0)
			{
#ifdef LWLOCK_PART_SHARED_OPS_ATOMIC
				/* Already incremented the shared counter partition above */
#else
				lock->shared++;
#endif
				mustwait = false;
			}
			else
			{
#ifdef LWLOCK_PART_SHARED_OPS_ATOMIC
				/* Must undo shared counter partition increment. Note that
				 * we *need* to do that while holding the mutex. Otherwise,
				 * the exclusive lock could be released and attempted to be
				 * re-acquired before we undo the increment. That attempt
				 * would then block, even though there'd be no lock holder
				 * left
				 */
				LWLOCK_PART_SHARED_POSTDEC_ATOMIC(lock, lockid, part, MyBackendId);
#endif
				mustwait = true;
			}
		}
		else
		{
			/* Step (1). Acquire mutex. Time spent holding mutex should be
			 *                          short!
			 */
			SpinLockAcquire(&lock->mutex);
			
			if (lock->exclusive == 0)
			{
				/* Step (2). Check for shared lockers. This surely happens
				 * after (1), otherwise SpinLockAcquire() is broken. Lock
				 * acquire semantics demand that no load must be re-ordered
				 * from after a lock acquisition to before, for obvious
				 * reasons.
				 */

				LWLOCK_IS_SHARED(mustwait, lock, lockid);
				
				if (!mustwait) {
					/* Step (3a). Set exclusive flag. This surely happens
					 * after (2) because it depends on the result of (2),
					 * no matter how much reordering is going on here.
					 */
					lock->exclusive++;
				}
			}
			else
				mustwait = true;
		}
		
		/* If retrying, allow LWLockRelease to release waiters again.
		 * This is also separately done in the LW_SHARED early exit case
		 * above, and in contrast to there we don't hold the mutex there.
		 * See the comment there for why this is safe
		 */
		if (retry)
			lock->releaseOK = true;
		
		if (!mustwait)
			break;				/* got the lock */
			
		/*
		 * Step (3b). Add myself to wait queue.
		 *
		 * If we don't have a PGPROC structure, there's no way to wait. This
		 * should never occur, since MyProc should only be null during shared
		 * memory initialization.
		 */
		if (proc == NULL)
			elog(PANIC, "cannot wait without a PGPROC structure");

		proc->lwWaiting = true;
		proc->lwExclusive = (mode == LW_EXCLUSIVE);
		proc->lwWaitLink = NULL;
		if (lock->head == NULL)
			lock->head = proc;
		else
			lock->tail->lwWaitLink = proc;
		lock->tail = proc;

		/* Can release the mutex now */
		SpinLockRelease(&lock->mutex);

		/*
		 * Wait until awakened.
		 *
		 * Since we share the process wait semaphore with the regular lock
		 * manager and ProcWaitForSignal, and we may need to acquire an LWLock
		 * while one of those is pending, it is possible that we get awakened
		 * for a reason other than being signaled by LWLockRelease. If so,
		 * loop back and wait again.  Once we've gotten the LWLock,
		 * re-increment the sema by the number of additional signals received,
		 * so that the lock manager or signal manager will see the received
		 * signal when it next waits.
		 */
		LOG_LWDEBUG("LWLockAcquire", lockid, "waiting");

#ifdef LWLOCK_STATS
		block_counts[lockid]++;
#endif

		TRACE_POSTGRESQL_LWLOCK_WAIT_START(lockid, mode);

		for (;;)
		{
			/* "false" means cannot accept cancel/die interrupt here. */
			PGSemaphoreLock(&proc->sem, false);
			if (!proc->lwWaiting)
				break;
			extraWaits++;
		}

		TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(lockid, mode);

		LOG_LWDEBUG("LWLockAcquire", lockid, "awakened");

		/* Now loop back and try to acquire lock again. */
		retry = true;
	}

	/* We are done updating shared state of the lock itself. */
	SpinLockRelease(&lock->mutex);
	
	/* Step 4. Enter protected section. This surely happens after (3),
	 * this time because lock release semantics demand that no store
	 * must be moved from before a lock release to after the release,
	 * again for obvious reasons
	 */

#ifdef LWLOCK_PART_SHARED_OPS_ATOMIC
lock_acquired:
#endif

	TRACE_POSTGRESQL_LWLOCK_ACQUIRE(lockid, mode);

	/* Add lock to list of locks held by this backend */
	held_lwlocks[num_held_lwlocks] = lockid;
	held_lwlocks_mode[num_held_lwlocks] = mode;
	++num_held_lwlocks;

	/*
	 * Fix the process wait semaphore's count for any absorbed wakeups.
	 */
	while (extraWaits-- > 0)
		PGSemaphoreUnlock(&proc->sem);
}
Example #26
0
/*
 * LWLockWaitForVar - Wait until lock is free, or a variable is updated.
 *
 * If the lock is held and *valptr equals oldval, waits until the lock is
 * either freed, or the lock holder updates *valptr by calling
 * LWLockUpdateVar.  If the lock is free on exit (immediately or after
 * waiting), returns true.  If the lock is still held, but *valptr no longer
 * matches oldval, returns false and sets *newval to the current value in
 * *valptr.
 *
 * It's possible that the lock holder releases the lock, but another backend
 * acquires it again before we get a chance to observe that the lock was
 * momentarily released.  We wouldn't need to wait for the new lock holder,
 * but we cannot distinguish that case, so we will have to wait.
 *
 * Note: this function ignores shared lock holders; if the lock is held
 * in shared mode, returns 'true'.
 */
bool
LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
{
	PGPROC	   *proc = MyProc;
	int			extraWaits = 0;
	bool		result = false;
#ifdef LWLOCK_STATS
	lwlock_stats *lwstats;
#endif

	PRINT_LWDEBUG("LWLockWaitForVar", lock);

#ifdef LWLOCK_STATS
	lwstats = get_lwlock_stats_entry(lock);
#endif   /* LWLOCK_STATS */

	/*
	 * Quick test first to see if the slot is free right now.
	 *
	 * XXX: the caller uses a spinlock before this, so we don't need a memory
	 * barrier here as far as the current usage is concerned.  But that might
	 * not be safe in general.
	 */
	if (lock->exclusive == 0)
		return true;

	/*
	 * Lock out cancel/die interrupts while we sleep on the lock.  There is no
	 * cleanup mechanism to remove us from the wait queue if we got
	 * interrupted.
	 */
	HOLD_INTERRUPTS();

	/*
	 * Loop here to check the lock's status after each time we are signaled.
	 */
	for (;;)
	{
		bool		mustwait;
		uint64		value;

		/* Acquire mutex.  Time spent holding mutex should be short! */
#ifdef LWLOCK_STATS
		lwstats->spin_delay_count += SpinLockAcquire(&lock->mutex);
#else
		SpinLockAcquire(&lock->mutex);
#endif

		/* Is the lock now free, and if not, does the value match? */
		if (lock->exclusive == 0)
		{
			result = true;
			mustwait = false;
		}
		else
		{
			value = *valptr;
			if (value != oldval)
			{
				result = false;
				mustwait = false;
				*newval = value;
			}
			else
				mustwait = true;
		}

		if (!mustwait)
			break;				/* the lock was free or value didn't match */

		/*
		 * Add myself to wait queue.
		 */
		proc->lwWaiting = true;
		proc->lwWaitMode = LW_WAIT_UNTIL_FREE;
		/* waiters are added to the front of the queue */
		proc->lwWaitLink = lock->head;
		if (lock->head == NULL)
			lock->tail = proc;
		lock->head = proc;

		/*
		 * Set releaseOK, to make sure we get woken up as soon as the lock is
		 * released.
		 */
		lock->releaseOK = true;

		/* Can release the mutex now */
		SpinLockRelease(&lock->mutex);

		/*
		 * Wait until awakened.
		 *
		 * Since we share the process wait semaphore with the regular lock
		 * manager and ProcWaitForSignal, and we may need to acquire an LWLock
		 * while one of those is pending, it is possible that we get awakened
		 * for a reason other than being signaled by LWLockRelease. If so,
		 * loop back and wait again.  Once we've gotten the LWLock,
		 * re-increment the sema by the number of additional signals received,
		 * so that the lock manager or signal manager will see the received
		 * signal when it next waits.
		 */
		LOG_LWDEBUG("LWLockWaitForVar", T_NAME(lock), T_ID(lock), "waiting");

#ifdef LWLOCK_STATS
		lwstats->block_count++;
#endif

		TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), T_ID(lock),
										   LW_EXCLUSIVE);

		for (;;)
		{
			/* "false" means cannot accept cancel/die interrupt here. */
			PGSemaphoreLock(&proc->sem, false);
			if (!proc->lwWaiting)
				break;
			extraWaits++;
		}

		TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), T_ID(lock),
										  LW_EXCLUSIVE);

		LOG_LWDEBUG("LWLockWaitForVar", T_NAME(lock), T_ID(lock), "awakened");

		/* Now loop back and check the status of the lock again. */
	}

	/* We are done updating shared state of the lock itself. */
	SpinLockRelease(&lock->mutex);

	TRACE_POSTGRESQL_LWLOCK_ACQUIRE(T_NAME(lock), T_ID(lock), LW_EXCLUSIVE);

	/*
	 * Fix the process wait semaphore's count for any absorbed wakeups.
	 */
	while (extraWaits-- > 0)
		PGSemaphoreUnlock(&proc->sem);

	/*
	 * Now okay to allow cancel/die interrupts.
	 */
	RESUME_INTERRUPTS();

	return result;
}
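A sketch of how a caller might drive this interface, consistent with the contract
described above (variable and value names are illustrative): keep waiting until
the lock is free, restarting from the updated value whenever the holder advances
it.

	/* Sketch: wait until the lock is free or the protected value moves on. */
	uint64		observedVal = startValue;	/* illustrative starting value */

	for (;;)
	{
		uint64		newVal;

		if (LWLockWaitForVar(lock, valptr, observedVal, &newVal))
			break;				/* lock is free; the holder has finished */

		/* Lock still held, but the value advanced; wait from the new value. */
		observedVal = newVal;
	}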
Example #27
0
/*
 * CreateShardsWithRoundRobinPolicy creates empty shards for the given table
 * based on the specified number of initial shards. The function first updates
 * metadata on the coordinator node to make this shard (and its placements)
 * visible. Note that the function assumes the table is hash partitioned and
 * calculates the min/max hash token ranges for each shard, giving them an equal
 * split of the hash space. Finally, the function creates empty shard placements on
 * worker nodes.
 */
void
CreateShardsWithRoundRobinPolicy(Oid distributedTableId, int32 shardCount,
								 int32 replicationFactor, bool useExclusiveConnections)
{
	char shardStorageType = 0;
	List *workerNodeList = NIL;
	int32 workerNodeCount = 0;
	uint32 placementAttemptCount = 0;
	uint64 hashTokenIncrement = 0;
	List *existingShardList = NIL;
	int64 shardIndex = 0;
	DistTableCacheEntry *cacheEntry = DistributedTableCacheEntry(distributedTableId);
	bool colocatedShard = false;
	List *insertedShardPlacements = NIL;

	/* make sure table is hash partitioned */
	CheckHashPartitionedTable(distributedTableId);

	/*
	 * In contrast to append/range partitioned tables it makes more sense to
	 * require ownership privileges - shards for hash-partitioned tables are
	 * only created once, not continually during ingest as for the other
	 * partitioning types.
	 */
	EnsureTableOwner(distributedTableId);

	/* we plan to add shards: get an exclusive lock on relation oid */
	LockRelationOid(distributedTableId, ExclusiveLock);

	/* validate that shards haven't already been created for this table */
	existingShardList = LoadShardList(distributedTableId);
	if (existingShardList != NIL)
	{
		char *tableName = get_rel_name(distributedTableId);
		ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						errmsg("table \"%s\" has already had shards created for it",
							   tableName)));
	}

	/* make sure that at least one shard is specified */
	if (shardCount <= 0)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("shard_count must be positive")));
	}

	/* make sure that at least one replica is specified */
	if (replicationFactor <= 0)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("replication_factor must be positive")));
	}

	/* make sure that RF=1 if the table is streaming replicated */
	if (cacheEntry->replicationModel == REPLICATION_MODEL_STREAMING &&
		replicationFactor > 1)
	{
		char *relationName = get_rel_name(cacheEntry->relationId);
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("using replication factor %d with the streaming "
							   "replication model is not supported",
							   replicationFactor),
						errdetail("The table %s is marked as streaming replicated and "
								  "the shard replication factor of streaming replicated "
								  "tables must be 1.", relationName),
						errhint("Use replication factor 1.")));
	}

	/* calculate the split of the hash space */
	hashTokenIncrement = HASH_TOKEN_COUNT / shardCount;

	/* don't allow concurrent node list changes that require an exclusive lock */
	LockRelationOid(DistNodeRelationId(), RowShareLock);

	/* load and sort the worker node list for deterministic placement */
	workerNodeList = ActivePrimaryNodeList();
	workerNodeList = SortList(workerNodeList, CompareWorkerNodes);

	/* make sure we don't process cancel signals until all shards are created */
	HOLD_INTERRUPTS();

	workerNodeCount = list_length(workerNodeList);
	if (replicationFactor > workerNodeCount)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("replication_factor (%d) exceeds number of worker nodes "
							   "(%d)", replicationFactor, workerNodeCount),
						errhint("Add more worker nodes or try again with a lower "
								"replication factor.")));
	}

	/* if we have enough nodes, add an extra placement attempt for backup */
	placementAttemptCount = (uint32) replicationFactor;
	if (workerNodeCount > replicationFactor)
	{
		placementAttemptCount++;
	}

	/* set shard storage type according to relation type */
	shardStorageType = ShardStorageType(distributedTableId);

	for (shardIndex = 0; shardIndex < shardCount; shardIndex++)
	{
		uint32 roundRobinNodeIndex = shardIndex % workerNodeCount;

		/* initialize the hash token space for this shard */
		text *minHashTokenText = NULL;
		text *maxHashTokenText = NULL;
		int32 shardMinHashToken = INT32_MIN + (shardIndex * hashTokenIncrement);
		int32 shardMaxHashToken = shardMinHashToken + (hashTokenIncrement - 1);
		uint64 shardId = GetNextShardId();
		List *currentInsertedShardPlacements = NIL;

		/* if we are at the last shard, make sure the max token value is INT_MAX */
		if (shardIndex == (shardCount - 1))
		{
			shardMaxHashToken = INT32_MAX;
		}

		/* insert the shard metadata row along with its min/max values */
		minHashTokenText = IntegerToText(shardMinHashToken);
		maxHashTokenText = IntegerToText(shardMaxHashToken);

		/*
		 * Grabbing the shard metadata lock isn't technically necessary since
		 * we already hold an exclusive lock on the partition table, but we'll
		 * acquire it for the sake of completeness. As we're adding new active
		 * placements, the mode must be exclusive.
		 */
		LockShardDistributionMetadata(shardId, ExclusiveLock);

		InsertShardRow(distributedTableId, shardId, shardStorageType,
					   minHashTokenText, maxHashTokenText);

		currentInsertedShardPlacements = InsertShardPlacementRows(distributedTableId,
																  shardId,
																  workerNodeList,
																  roundRobinNodeIndex,
																  replicationFactor);
		insertedShardPlacements = list_concat(insertedShardPlacements,
											  currentInsertedShardPlacements);
	}

	CreateShardsOnWorkers(distributedTableId, insertedShardPlacements,
						  useExclusiveConnections, colocatedShard);

	if (QueryCancelPending)
	{
		ereport(WARNING, (errmsg("cancel requests are ignored during shard creation")));
		QueryCancelPending = false;
	}

	RESUME_INTERRUPTS();
}
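
A quick aside on the pattern the tail of this function uses: hold off cancel/die interrupts across a multi-step operation that must not be left half-done, then discard any cancel request that arrived while interrupts were held. A minimal sketch of that shape, not taken from the sources above; do_critical_step() is a hypothetical stand-in for the real work:

extern void do_critical_step(int stepIndex);	/* hypothetical helper */

static void
perform_uninterruptible_steps(int stepCount)
{
	int			stepIndex;

	/* don't service cancel/die interrupts until every step has run */
	HOLD_INTERRUPTS();

	for (stepIndex = 0; stepIndex < stepCount; stepIndex++)
		do_critical_step(stepIndex);

	/* acknowledge, but deliberately ignore, any cancel that arrived meanwhile */
	if (QueryCancelPending)
	{
		ereport(WARNING, (errmsg("cancel requests are ignored during this operation")));
		QueryCancelPending = false;
	}

	RESUME_INTERRUPTS();
}
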
Example #28
0
/* --------------------------------
 *		internal_flush - flush pending output
 *
 * Returns 0 if OK (meaning everything was sent, or operation would block
 * and the socket is in non-blocking mode), or EOF if trouble.
 * --------------------------------
 */
static int
internal_flush(void)
{
	static int	last_reported_send_errno = 0;

	char	   *bufptr = PqSendBuffer + PqSendStart;
	char	   *bufend = PqSendBuffer + PqSendPointer;

	while (bufptr < bufend)
	{
		int			r;

		r = secure_write(MyProcPort, bufptr, bufend - bufptr);

		if (r <= 0)
		{
			if (errno == EINTR)
				continue;		/* Ok if we were interrupted */

			/*
			 * Ok if no data writable without blocking, and the socket is in
			 * non-blocking mode.
			 */
			if (errno == EAGAIN ||
				errno == EWOULDBLOCK)
			{
				return 0;
			}

			/*
			 * Careful: an ereport() that tries to write to the client would
			 * cause recursion to here, leading to stack overflow and core
			 * dump!  This message must go *only* to the postmaster log.
			 *
			 * If a client disconnects while we're in the midst of output, we
			 * might write quite a bit of data before we get to a safe query
			 * abort point.  So, suppress duplicate log messages.
			 */
			if (errno != last_reported_send_errno)
			{
				last_reported_send_errno = errno;
				/*
				 * Hold off cancel/die interrupts while emitting this
				 * log-only report, so a pending cancel cannot be serviced
				 * partway through it.
				 */
				HOLD_INTERRUPTS();

				ereport(COMMERROR,
						(errcode_for_socket_access(),
						 errmsg("could not send data to client: %m")));
				RESUME_INTERRUPTS();
			}

			/*
			 * We drop the buffered data anyway so that processing can
			 * continue, even though we'll probably quit soon. We also set a
			 * flag that'll cause the next CHECK_FOR_INTERRUPTS to terminate
			 * the connection.
			 */
			PqSendStart = PqSendPointer = 0;
			ClientConnectionLost = 1;
			InterruptPending = 1;
			return EOF;
		}

		last_reported_send_errno = 0;	/* reset after any successful send */
		bufptr += r;
		PqSendStart += r;
	}

	PqSendStart = PqSendPointer = 0;
	return 0;
}
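
The errno bookkeeping above can be read as one small idiom: report a send failure only once per distinct errno, route it to the server log alone (COMMERROR), and hold off interrupts so a pending cancel cannot be serviced in the middle of the report. A minimal sketch of just that piece, assuming the caller keeps the last-reported errno; the helper name is hypothetical:

static void
report_send_error_once(int *last_errno)
{
	/* suppress duplicate reports for the same errno */
	if (errno == *last_errno)
		return;

	*last_errno = errno;

	/* hold off interrupts while emitting the log-only report */
	HOLD_INTERRUPTS();
	ereport(COMMERROR,
			(errcode_for_socket_access(),
			 errmsg("could not send data to client: %m")));
	RESUME_INTERRUPTS();
}
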
Example #29
0
/*
 * Signal handler for SIGALRM
 *
 * Process any active timeout reasons and then reschedule the interrupt
 * as needed.
 */
static void
handle_sig_alarm(SIGNAL_ARGS)
{
	int			save_errno = errno;
	bool		save_ImmediateInterruptOK = ImmediateInterruptOK;

	/*
	 * We may be executing while ImmediateInterruptOK is true (e.g., when
	 * mainline is waiting for a lock).  If SIGINT or similar arrives while
	 * this code is running, we'd lose control and perhaps leave our data
	 * structures in an inconsistent state.  Disable immediate interrupts, and
	 * just to be real sure, bump the holdoff counter as well.  (The reason
	 * for this belt-and-suspenders-too approach is to make sure that nothing
	 * bad happens if a timeout handler calls code that manipulates
	 * ImmediateInterruptOK.)
	 *
	 * Note: it's possible for a SIGINT to interrupt handle_sig_alarm before
	 * we manage to do this; the net effect would be as if the SIGALRM event
	 * had been silently lost.  Therefore error recovery must include some
	 * action that will allow any lost interrupt to be rescheduled.  Disabling
	 * some or all timeouts is sufficient, or if that's not appropriate,
	 * reschedule_timeouts() can be called.  Also, the signal blocking hazard
	 * described below applies here too.
	 */
	ImmediateInterruptOK = false;
	HOLD_INTERRUPTS();

	/*
	 * SIGALRM is always cause for waking anything waiting on the process
	 * latch.  Cope with MyProc not being there, as the startup process also
	 * uses this signal handler.
	 */
	if (MyProc)
		SetLatch(&MyProc->procLatch);

	/*
	 * Fire any pending timeouts, but only if we're enabled to do so.
	 */
	if (alarm_enabled)
	{
		/*
		 * Disable alarms, just in case this platform allows signal handlers
		 * to interrupt themselves.  schedule_alarm() will re-enable if
		 * appropriate.
		 */
		disable_alarm();

		if (num_active_timeouts > 0)
		{
			TimestampTz now = GetCurrentTimestamp();

			/* While the first pending timeout has been reached ... */
			while (num_active_timeouts > 0 &&
				   now >= active_timeouts[0]->fin_time)
			{
				timeout_params *this_timeout = active_timeouts[0];

				/* Remove it from the active list */
				remove_timeout_index(0);

				/* Mark it as fired */
				this_timeout->indicator = true;

				/* And call its handler function */
				(*this_timeout->timeout_handler) ();

				/*
				 * The handler might not take negligible time (CheckDeadLock
				 * for instance isn't too cheap), so let's update our idea of
				 * "now" after each one.
				 */
				now = GetCurrentTimestamp();
			}

			/* Done firing timeouts, so reschedule next interrupt if any */
			schedule_alarm(now);
		}
	}

	/*
	 * Re-allow query cancel, and then try to service any cancel request that
	 * arrived meanwhile (this might in particular include a cancel request
	 * fired by one of the timeout handlers).  Since we are in a signal
	 * handler, we mustn't call ProcessInterrupts unless ImmediateInterruptOK
	 * is set; if it isn't, the cancel will happen at the next mainline
	 * CHECK_FOR_INTERRUPTS.
	 *
	 * Note: a longjmp from here is safe so far as our own data structures are
	 * concerned; but on platforms that block a signal before calling the
	 * handler and then un-block it on return, longjmping out of the signal
	 * handler leaves SIGALRM still blocked.  Error cleanup is responsible for
	 * unblocking any blocked signals.
	 */
	RESUME_INTERRUPTS();
	ImmediateInterruptOK = save_ImmediateInterruptOK;
	if (save_ImmediateInterruptOK)
		CHECK_FOR_INTERRUPTS();

	errno = save_errno;
}
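
Stripped of the timeout machinery, handle_sig_alarm() frames its work the same way every time: save errno, disable immediate interrupts, bump the holdoff counter, do the work, then undo everything in reverse order and service any interrupt that arrived meanwhile. A minimal sketch of that framing, with do_handler_work() as a hypothetical stand-in for the handler body:

extern void do_handler_work(void);	/* hypothetical: the handler's real job */

static void
handle_some_signal(SIGNAL_ARGS)
{
	int			save_errno = errno;
	bool		save_ImmediateInterruptOK = ImmediateInterruptOK;

	/* keep cancel/die from taking control away while we run */
	ImmediateInterruptOK = false;
	HOLD_INTERRUPTS();

	do_handler_work();

	/* undo in reverse order, then service anything that arrived meanwhile */
	RESUME_INTERRUPTS();
	ImmediateInterruptOK = save_ImmediateInterruptOK;
	if (save_ImmediateInterruptOK)
		CHECK_FOR_INTERRUPTS();

	errno = save_errno;
}
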
Example #30
0
/*
 * Main entry point for bgwriter process
 *
 * This is invoked from BootstrapMain, which has already created the basic
 * execution environment, but not enabled signals yet.
 */
void
BackgroundWriterMain(void)
{
    sigjmp_buf	local_sigjmp_buf;
    MemoryContext bgwriter_context;

    am_bg_writer = true;

    /*
     * If possible, make this process a group leader, so that the postmaster
     * can signal any child processes too.	(bgwriter probably never has any
     * child processes, but for consistency we make all postmaster child
     * processes do this.)
     */
#ifdef HAVE_SETSID
    if (setsid() < 0)
        elog(FATAL, "setsid() failed: %m");
#endif

    /*
     * Properly accept or ignore signals the postmaster might send us
     *
     * SIGUSR1 is presently unused; keep it spare in case someday we want this
     * process to participate in ProcSignal signalling.
     */
    pqsignal(SIGHUP, BgSigHupHandler);	/* set flag to read config file */
    pqsignal(SIGINT, SIG_IGN);			/* as of 9.2 no longer requests checkpoint */
    pqsignal(SIGTERM, ReqShutdownHandler); 	/* shutdown */
    pqsignal(SIGQUIT, bg_quickdie);		/* hard crash time */
    pqsignal(SIGALRM, SIG_IGN);
    pqsignal(SIGPIPE, SIG_IGN);
    pqsignal(SIGUSR1, SIG_IGN);			/* reserve for ProcSignal */
    pqsignal(SIGUSR2, SIG_IGN);

    /*
     * Reset some signals that are accepted by postmaster but not here
     */
    pqsignal(SIGCHLD, SIG_DFL);
    pqsignal(SIGTTIN, SIG_DFL);
    pqsignal(SIGTTOU, SIG_DFL);
    pqsignal(SIGCONT, SIG_DFL);
    pqsignal(SIGWINCH, SIG_DFL);

    /* We allow SIGQUIT (quickdie) at all times */
    sigdelset(&BlockSig, SIGQUIT);

    /*
     * Create a resource owner to keep track of our resources (currently only
     * buffer pins).
     */
    CurrentResourceOwner = ResourceOwnerCreate(NULL, "Background Writer");

    /*
     * Create a memory context that we will do all our work in.  We do this so
     * that we can reset the context during error recovery and thereby avoid
     * possible memory leaks.  Formerly this code just ran in
     * TopMemoryContext, but resetting that would be a really bad idea.
     */
    bgwriter_context = AllocSetContextCreate(TopMemoryContext,
                       "Background Writer",
                       ALLOCSET_DEFAULT_MINSIZE,
                       ALLOCSET_DEFAULT_INITSIZE,
                       ALLOCSET_DEFAULT_MAXSIZE);
    MemoryContextSwitchTo(bgwriter_context);

    /*
     * If an exception is encountered, processing resumes here.
     *
     * See notes in postgres.c about the design of this coding.
     */
    if (sigsetjmp(local_sigjmp_buf, 1) != 0)
    {
        /* Since not using PG_TRY, must reset error stack by hand */
        error_context_stack = NULL;

        /* Prevent interrupts while cleaning up */
        HOLD_INTERRUPTS();

        /* Report the error to the server log */
        EmitErrorReport();

        /*
         * These operations are really just a minimal subset of
         * AbortTransaction().	We don't have very many resources to worry
         * about in bgwriter, but we do have LWLocks, buffers, and temp files.
         */
        LWLockReleaseAll();
        AbortBufferIO();
        UnlockBuffers();
        /* buffer pins are released here: */
        ResourceOwnerRelease(CurrentResourceOwner,
                             RESOURCE_RELEASE_BEFORE_LOCKS,
                             false, true);
        /* we needn't bother with the other ResourceOwnerRelease phases */
        AtEOXact_Buffers(false);
        AtEOXact_Files();
        AtEOXact_HashTables(false);

        /*
         * Now return to normal top-level context and clear ErrorContext for
         * next time.
         */
        MemoryContextSwitchTo(bgwriter_context);
        FlushErrorState();

        /* Flush any leaked data in the top-level context */
        MemoryContextResetAndDeleteChildren(bgwriter_context);

        /* Now we can allow interrupts again */
        RESUME_INTERRUPTS();

        /*
         * Sleep at least 1 second after any error.  A write error is likely
         * to be repeated, and we don't want to be filling the error logs as
         * fast as we can.
         */
        pg_usleep(1000000L);

        /*
         * Close all open files after any error.  This is helpful on Windows,
         * where holding deleted files open causes various strange errors.
         * It's not clear we need it elsewhere, but shouldn't hurt.
         */
        smgrcloseall();
    }

    /* We can now handle ereport(ERROR) */
    PG_exception_stack = &local_sigjmp_buf;

    /*
     * Unblock signals (they were blocked when the postmaster forked us)
     */
    PG_SETMASK(&UnBlockSig);

    /*
     * Use the recovery target timeline ID during recovery
     */
    if (RecoveryInProgress())
        ThisTimeLineID = GetRecoveryTargetTLI();

    /*
     * Loop forever
     */
    for (;;)
    {
        /*
         * Emergency bailout if postmaster has died.  This is to avoid the
         * necessity for manual cleanup of all postmaster children.
         */
        if (!PostmasterIsAlive())
            exit(1);

        if (got_SIGHUP)
        {
            got_SIGHUP = false;
            ProcessConfigFile(PGC_SIGHUP);
            /* update global shmem state for sync rep */
        }
        if (shutdown_requested)
        {
            /*
             * From here on, elog(ERROR) should end with exit(1), not send
             * control back to the sigsetjmp block above
             */
            ExitOnAnyError = true;
            /* Normal exit from the bgwriter is here */
            proc_exit(0);		/* done */
        }

        /*
         * Do one cycle of dirty-buffer writing.
         */
        BgBufferSync();

        /* Nap for the configured time. */
        BgWriterNap();
    }
}
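
The error-recovery scaffolding in BackgroundWriterMain() reduces to a reusable skeleton: a sigsetjmp target that, with interrupts held, reports the error, releases resources, resets the per-process memory context, and backs off before re-entering the main loop. A minimal sketch under hypothetical names (worker_context, release_worker_resources(), do_one_cycle()):

static MemoryContext worker_context;			/* hypothetical: created during startup */
extern void release_worker_resources(void);		/* hypothetical: LWLocks, buffers, files */
extern void do_one_cycle(void);					/* hypothetical: one iteration of real work */

static void
worker_main_loop(void)
{
	sigjmp_buf	local_sigjmp_buf;

	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
	{
		/* prevent interrupts while cleaning up after an error */
		HOLD_INTERRUPTS();

		EmitErrorReport();
		release_worker_resources();

		MemoryContextSwitchTo(worker_context);
		FlushErrorState();
		MemoryContextResetAndDeleteChildren(worker_context);

		/* interrupts are safe again; back off before retrying */
		RESUME_INTERRUPTS();
		pg_usleep(1000000L);
	}

	/* errors raised after this point land in the sigsetjmp block above */
	PG_exception_stack = &local_sigjmp_buf;

	for (;;)
		do_one_cycle();
}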