/*
 * pl_ist_reset - Abort the given number of ISTs and set a Python error.
 *
 * Used to handle cases where a code object fails to resolve open transactions.
 */
void
pl_ist_reset(unsigned long count)
{
    HOLD_INTERRUPTS();

    for (; count > 0; --count)
        RollbackAndReleaseCurrentSubTransaction();

    RESUME_INTERRUPTS();
}
/*
 * Cancel any pending wait for lock, when aborting a transaction, and revert
 * any strong lock count acquisition for a lock being acquired.
 *
 * (Normally, this would only happen if we accept a cancel/die
 * interrupt while waiting; but an ereport(ERROR) before or during the lock
 * wait is within the realm of possibility, too.)
 */
void
LockErrorCleanup(void)
{
    LWLock     *partitionLock;
    DisableTimeoutParams timeouts[2];

    HOLD_INTERRUPTS();

    AbortStrongLockAcquire();

    /* Nothing to do if we weren't waiting for a lock */
    if (lockAwaited == NULL)
    {
        RESUME_INTERRUPTS();
        return;
    }

    /*
     * Turn off the deadlock and lock timeout timers, if they are still
     * running (see ProcSleep).  Note we must preserve the LOCK_TIMEOUT
     * indicator flag, since this function is executed before
     * ProcessInterrupts when responding to SIGINT; else we'd lose the
     * knowledge that the SIGINT came from a lock timeout and not an external
     * source.
     */
    timeouts[0].id = DEADLOCK_TIMEOUT;
    timeouts[0].keep_indicator = false;
    timeouts[1].id = LOCK_TIMEOUT;
    timeouts[1].keep_indicator = true;
    disable_timeouts(timeouts, 2);

    /* Unlink myself from the wait queue, if on it (might not be anymore!) */
    partitionLock = LockHashPartitionLock(lockAwaited->hashcode);
    LWLockAcquire(partitionLock, LW_EXCLUSIVE);

    if (MyProc->links.next != NULL)
    {
        /* We could not have been granted the lock yet */
        RemoveFromWaitQueue(MyProc, lockAwaited->hashcode);
    }
    else
    {
        /*
         * Somebody kicked us off the lock queue already.  Perhaps they
         * granted us the lock, or perhaps they detected a deadlock.  If they
         * did grant us the lock, we'd better remember it in our local lock
         * table.
         */
        if (MyProc->waitStatus == STATUS_OK)
            GrantAwaitedLock();
    }

    lockAwaited = NULL;

    LWLockRelease(partitionLock);

    RESUME_INTERRUPTS();
}
/*
 * LWLockConditionalAcquire - acquire a lightweight lock in the specified mode
 *
 * If the lock is not available, return FALSE with no side-effects.
 *
 * If successful, cancel/die interrupts are held off until lock release.
 */
bool
LWLockConditionalAcquire(LWLockId lockid, LWLockMode mode)
{
    volatile LWLock *lock = LWLockArray + lockid;
    bool        mustwait;

    PRINT_LWDEBUG("LWLockConditionalAcquire", lockid, lock);

    /*
     * Lock out cancel/die interrupts until we exit the code section
     * protected by the LWLock.  This ensures that interrupts will not
     * interfere with manipulations of data structures in shared memory.
     */
    HOLD_INTERRUPTS();

    /* Acquire mutex.  Time spent holding mutex should be short! */
    SpinLockAcquire_NoHoldoff(&lock->mutex);

    /* If I can get the lock, do so quickly. */
    if (mode == LW_EXCLUSIVE)
    {
        if (lock->exclusive == 0 && lock->shared == 0)
        {
            lock->exclusive++;
            mustwait = false;
        }
        else
            mustwait = true;
    }
    else
    {
        if (lock->exclusive == 0)
        {
            lock->shared++;
            mustwait = false;
        }
        else
            mustwait = true;
    }

    /* We are done updating shared state of the lock itself. */
    SpinLockRelease_NoHoldoff(&lock->mutex);

    if (mustwait)
    {
        /* Failed to get lock, so release interrupt holdoff */
        RESUME_INTERRUPTS();
        LOG_LWDEBUG("LWLockConditionalAcquire", lockid, "failed");
    }
    else
    {
        /* Add lock to list of locks held by this backend */
        Assert(num_held_lwlocks < MAX_SIMUL_LWLOCKS);
        held_lwlocks[num_held_lwlocks++] = lockid;
    }

    return !mustwait;
}
/*
 * LWLockReleaseAll - release all currently-held locks
 *
 * Used to clean up after ereport(ERROR).  An important difference between
 * this function and retail LWLockRelease calls is that InterruptHoldoffCount
 * is unchanged by this operation.  This is necessary since
 * InterruptHoldoffCount has been set to an appropriate level earlier in
 * error recovery.  We could decrement it below zero if we allowed it to drop
 * for each released lock!
 */
void
LWLockReleaseAll(void)
{
    while (num_held_lwlocks > 0)
    {
        HOLD_INTERRUPTS();      /* match the upcoming RESUME_INTERRUPTS */

        LWLockRelease(held_lwlocks[num_held_lwlocks - 1]);
    }
}
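/*
 * Illustrative sketch only (not from the original source): a standalone
 * model of the InterruptHoldoffCount discipline described above.  Each
 * (modeled) LWLockRelease performs one RESUME_INTERRUPTS, so the release
 * loop must re-HOLD before every release or the holdoff level established
 * by error recovery would be driven below its intended value.  All names
 * here merely model PostgreSQL behavior under stated assumptions; they are
 * not the real implementations.
 */
#include <assert.h>

static volatile int InterruptHoldoffCount = 0;

#define HOLD_INTERRUPTS()   (InterruptHoldoffCount++)
#define RESUME_INTERRUPTS() \
    do { assert(InterruptHoldoffCount > 0); InterruptHoldoffCount--; } while (0)

static int  num_held_lwlocks = 3;   /* pretend three LWLocks are held */

static void
model_lwlock_release(void)
{
    num_held_lwlocks--;
    RESUME_INTERRUPTS();        /* every retail release resumes once */
}

int
main(void)
{
    HOLD_INTERRUPTS();          /* error recovery's own holdoff */

    while (num_held_lwlocks > 0)
    {
        HOLD_INTERRUPTS();      /* match the RESUME inside the release */
        model_lwlock_release();
    }

    /* Net effect: the holdoff level chosen by error recovery is intact. */
    assert(InterruptHoldoffCount == 1);
    return 0;
}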
/*
 * Report a COMMERROR.
 *
 * This function holds off interrupts while reporting the error, to avoid
 * a self-deadlock situation; see MPP-13718 for more info.
 */
static void
report_commerror(const char *err_msg)
{
    HOLD_INTERRUPTS();

    ereport(COMMERROR,
            (errcode(ERRCODE_PROTOCOL_VIOLATION),
             errmsg("%s", err_msg)));

    RESUME_INTERRUPTS();
}
inline
void
AbstractionLayer::Allocator::free(void *inPtr) const
{
    if (inPtr == NULL)
        return;

    /*
     * See allocate(const size_t, const std::nothrow_t&) for why we disable
     * processing of interrupts.
     */
    HOLD_INTERRUPTS();
    PG_TRY();
    {
        pfree(unaligned(inPtr));
    }
    PG_CATCH();
    {
        FlushErrorState();
    }
    PG_END_TRY();
    RESUME_INTERRUPTS();
}
void
FileRepSubProcess_Main()
{
    const char *statmsg;
    MemoryContext fileRepSubProcessMemoryContext;
    sigjmp_buf  local_sigjmp_buf;

    MyProcPid = getpid();

    MyStartTime = time(NULL);

    /*
     * Create a PGPROC so we can use LWLocks in FileRep sub-processes.  The
     * routine also registers cleanup at process exit.
     */
    InitAuxiliaryProcess();

    InitBufferPoolBackend();

    FileRepSubProcess_ConfigureSignals();

    /*
     * If an exception is encountered, processing resumes here.
     *
     * See notes in postgres.c about the design of this coding.
     */
    if (sigsetjmp(local_sigjmp_buf, 1) != 0)
    {
        /* Prevent interrupts while cleaning up */
        HOLD_INTERRUPTS();

        /* Report the error to the server log */
        EmitErrorReport();

        LWLockReleaseAll();

        if (FileRepPrimary_IsResyncManagerOrWorker())
        {
            LockReleaseAll(DEFAULT_LOCKMETHOD, false);
        }

        if (FileRepIsBackendSubProcess(fileRepProcessType))
        {
            AbortBufferIO();
            UnlockBuffers();

            /* buffer pins are released here: */
            ResourceOwnerRelease(CurrentResourceOwner,
                                 RESOURCE_RELEASE_BEFORE_LOCKS,
                                 false, true);
        }

        /*
         * We can now go away.  Note that because we'll call InitProcess, a
         * callback will be registered to do ProcKill, which will clean up
         * necessary state.
         */
        proc_exit(0);
    }

    /* We can now handle ereport(ERROR) */
    PG_exception_stack = &local_sigjmp_buf;

    PG_SETMASK(&UnBlockSig);

    /*
     * Identify myself via ps
     */
    statmsg = FileRepProcessTypeToString[fileRepProcessType];

    init_ps_display(statmsg, "", "", "");

    /* Create the memory context where cross-transaction state is stored */
    fileRepSubProcessMemoryContext = AllocSetContextCreate(TopMemoryContext,
                                                           "filerep subprocess memory context",
                                                           ALLOCSET_DEFAULT_MINSIZE,
                                                           ALLOCSET_DEFAULT_INITSIZE,
                                                           ALLOCSET_DEFAULT_MAXSIZE);

    MemoryContextSwitchTo(fileRepSubProcessMemoryContext);

    stateChangeRequestCounter++;

    FileRepSubProcess_ProcessSignals();

    switch (fileRepProcessType)
    {
        case FileRepProcessTypePrimarySender:
            FileRepPrimary_StartSender();
            break;

        case FileRepProcessTypeMirrorReceiver:
            FileRepMirror_StartReceiver();
            break;

        case FileRepProcessTypeMirrorConsumer:
        case FileRepProcessTypeMirrorConsumerWriter:
        case FileRepProcessTypeMirrorConsumerAppendOnly1:
            FileRepMirror_StartConsumer();
            break;

        case FileRepProcessTypeMirrorSenderAck:
            FileRepAckMirror_StartSender();
            break;

        case FileRepProcessTypePrimaryReceiverAck:
            FileRepAckPrimary_StartReceiver();
            break;

        case FileRepProcessTypePrimaryConsumerAck:
            FileRepAckPrimary_StartConsumer();
            break;

        case FileRepProcessTypePrimaryRecovery:
            FileRepSubProcess_InitProcess();

            /*
             * At this point, database is starting up and xlog is not yet
             * replayed.  Initializing relcache now is dangerous, a
             * sequential scan of catalog tables may end up with incorrect
             * hint bits.  E.g. a committed transaction's dirty heap pages
             * made it to disk but pg_clog update was still in memory and we
             * crashed.  If a tuple inserted by this transaction is read
             * during relcache initialization, status of the tuple's xmin
             * will be incorrectly determined as "not committed" from
             * pg_clog.  And HEAP_XMIN_INVALID hint bit will be set,
             * rendering the tuple perpetually invisible.  Relcache
             * initialization must be deferred to only after all of xlog has
             * been replayed.
             */
            FileRepPrimary_StartRecovery();

            ResourceOwnerRelease(CurrentResourceOwner,
                                 RESOURCE_RELEASE_BEFORE_LOCKS,
                                 false, true);
            break;

        case FileRepProcessTypeResyncManager:
            FileRepSubProcess_InitProcess();
            FileRepPrimary_StartResyncManager();

            ResourceOwnerRelease(CurrentResourceOwner,
                                 RESOURCE_RELEASE_BEFORE_LOCKS,
                                 false, true);
            break;

        case FileRepProcessTypeResyncWorker1:
        case FileRepProcessTypeResyncWorker2:
        case FileRepProcessTypeResyncWorker3:
        case FileRepProcessTypeResyncWorker4:
            FileRepSubProcess_InitProcess();
            FileRepPrimary_StartResyncWorker();

            ResourceOwnerRelease(CurrentResourceOwner,
                                 RESOURCE_RELEASE_BEFORE_LOCKS,
                                 false, true);
            break;

        default:
            elog(PANIC, "unrecognized process type: %s(%d)",
                 statmsg, fileRepProcessType);
            break;
    }

    switch (FileRepSubProcess_GetState())
    {
        case FileRepStateShutdown:
        case FileRepStateReady:
            proc_exit(0);
            break;

        default:
            proc_exit(2);
            break;
    }
}
/*
 * Main entry point for checkpointer process
 *
 * This is invoked from AuxiliaryProcessMain, which has already created the
 * basic execution environment, but not enabled signals yet.
 */
void
CheckpointerMain(void)
{
    sigjmp_buf  local_sigjmp_buf;
    MemoryContext checkpointer_context;

    CheckpointerShmem->checkpointer_pid = MyProcPid;

    /*
     * Properly accept or ignore signals the postmaster might send us
     *
     * Note: we deliberately ignore SIGTERM, because during a standard Unix
     * system shutdown cycle, init will SIGTERM all processes at once.  We
     * want to wait for the backends to exit, whereupon the postmaster will
     * tell us it's okay to shut down (via SIGUSR2).
     */
    pqsignal(SIGHUP, ChkptSigHupHandler);   /* set flag to read config file */
    pqsignal(SIGINT, ReqCheckpointHandler); /* request checkpoint */
    pqsignal(SIGTERM, SIG_IGN);             /* ignore SIGTERM */
    pqsignal(SIGQUIT, chkpt_quickdie);      /* hard crash time */
    pqsignal(SIGALRM, SIG_IGN);
    pqsignal(SIGPIPE, SIG_IGN);
    pqsignal(SIGUSR1, chkpt_sigusr1_handler);
    pqsignal(SIGUSR2, ReqShutdownHandler);  /* request shutdown */

    /*
     * Reset some signals that are accepted by postmaster but not here
     */
    pqsignal(SIGCHLD, SIG_DFL);
    pqsignal(SIGTTIN, SIG_DFL);
    pqsignal(SIGTTOU, SIG_DFL);
    pqsignal(SIGCONT, SIG_DFL);
    pqsignal(SIGWINCH, SIG_DFL);

    /* We allow SIGQUIT (quickdie) at all times */
    sigdelset(&BlockSig, SIGQUIT);

    /*
     * Initialize so that first time-driven event happens at the correct
     * time.
     */
    last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);

    /*
     * Create a resource owner to keep track of our resources (currently only
     * buffer pins).
     */
    CurrentResourceOwner = ResourceOwnerCreate(NULL, "Checkpointer");

    /*
     * Create a memory context that we will do all our work in.  We do this
     * so that we can reset the context during error recovery and thereby
     * avoid possible memory leaks.  Formerly this code just ran in
     * TopMemoryContext, but resetting that would be a really bad idea.
     */
    checkpointer_context = AllocSetContextCreate(TopMemoryContext,
                                                 "Checkpointer",
                                                 ALLOCSET_DEFAULT_SIZES);
    MemoryContextSwitchTo(checkpointer_context);

    /*
     * If an exception is encountered, processing resumes here.
     *
     * See notes in postgres.c about the design of this coding.
     */
    if (sigsetjmp(local_sigjmp_buf, 1) != 0)
    {
        /* Since not using PG_TRY, must reset error stack by hand */
        error_context_stack = NULL;

        /* Prevent interrupts while cleaning up */
        HOLD_INTERRUPTS();

        /* Report the error to the server log */
        EmitErrorReport();

        /*
         * These operations are really just a minimal subset of
         * AbortTransaction().  We don't have very many resources to worry
         * about in checkpointer, but we do have LWLocks, buffers, and temp
         * files.
         */
        LWLockReleaseAll();
        ConditionVariableCancelSleep();
        pgstat_report_wait_end();
        AbortBufferIO();
        UnlockBuffers();
        /* buffer pins are released here: */
        ResourceOwnerRelease(CurrentResourceOwner,
                             RESOURCE_RELEASE_BEFORE_LOCKS,
                             false, true);
        /* we needn't bother with the other ResourceOwnerRelease phases */
        AtEOXact_Buffers(false);
        AtEOXact_SMgr();
        AtEOXact_Files();
        AtEOXact_HashTables(false);

        /* Warn any waiting backends that the checkpoint failed. */
        if (ckpt_active)
        {
            SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
            CheckpointerShmem->ckpt_failed++;
            CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started;
            SpinLockRelease(&CheckpointerShmem->ckpt_lck);

            ckpt_active = false;
        }

        /*
         * Now return to normal top-level context and clear ErrorContext for
         * next time.
         */
        MemoryContextSwitchTo(checkpointer_context);
        FlushErrorState();

        /* Flush any leaked data in the top-level context */
        MemoryContextResetAndDeleteChildren(checkpointer_context);

        /* Now we can allow interrupts again */
        RESUME_INTERRUPTS();

        /*
         * Sleep at least 1 second after any error.  A write error is likely
         * to be repeated, and we don't want to be filling the error logs as
         * fast as we can.
         */
        pg_usleep(1000000L);

        /*
         * Close all open files after any error.  This is helpful on Windows,
         * where holding deleted files open causes various strange errors.
         * It's not clear we need it elsewhere, but shouldn't hurt.
         */
        smgrcloseall();
    }

    /* We can now handle ereport(ERROR) */
    PG_exception_stack = &local_sigjmp_buf;

    /*
     * Unblock signals (they were blocked when the postmaster forked us)
     */
    PG_SETMASK(&UnBlockSig);

    /*
     * Ensure all shared memory values are set correctly for the config.
     * Doing this here ensures no race conditions from other concurrent
     * updaters.
     */
    UpdateSharedMemoryConfig();

    /*
     * Advertise our latch that backends can use to wake us up while we're
     * sleeping.
     */
    ProcGlobal->checkpointerLatch = &MyProc->procLatch;

    /*
     * Loop forever
     */
    for (;;)
    {
        bool        do_checkpoint = false;
        int         flags = 0;
        pg_time_t   now;
        int         elapsed_secs;
        int         cur_timeout;
        int         rc;

        /* Clear any already-pending wakeups */
        ResetLatch(MyLatch);

        /*
         * Process any requests or signals received recently.
         */
        AbsorbFsyncRequests();

        if (got_SIGHUP)
        {
            got_SIGHUP = false;
            ProcessConfigFile(PGC_SIGHUP);

            /*
             * Checkpointer is the last process to shut down, so we ask it to
             * hold the keys for a range of other tasks required most of
             * which have nothing to do with checkpointing at all.
             *
             * For various reasons, some config values can change dynamically
             * so the primary copy of them is held in shared memory to make
             * sure all backends see the same value.  We make Checkpointer
             * responsible for updating the shared memory copy if the
             * parameter setting changes because of SIGHUP.
             */
            UpdateSharedMemoryConfig();
        }
        if (checkpoint_requested)
        {
            checkpoint_requested = false;
            do_checkpoint = true;
            BgWriterStats.m_requested_checkpoints++;
        }
        if (shutdown_requested)
        {
            /*
             * From here on, elog(ERROR) should end with exit(1), not send
             * control back to the sigsetjmp block above
             */
            ExitOnAnyError = true;
            /* Close down the database */
            ShutdownXLOG(0, 0);
            /* Normal exit from the checkpointer is here */
            proc_exit(0);       /* done */
        }

        /*
         * Force a checkpoint if too much time has elapsed since the last
         * one.  Note that we count a timed checkpoint in stats only when
         * this occurs without an external request, but we set the CAUSE_TIME
         * flag bit even if there is also an external request.
         */
        now = (pg_time_t) time(NULL);
        elapsed_secs = now - last_checkpoint_time;
        if (elapsed_secs >= CheckPointTimeout)
        {
            if (!do_checkpoint)
                BgWriterStats.m_timed_checkpoints++;
            do_checkpoint = true;
            flags |= CHECKPOINT_CAUSE_TIME;
        }

        /*
         * Do a checkpoint if requested.
         */
        if (do_checkpoint)
        {
            bool        ckpt_performed = false;
            bool        do_restartpoint;

            /*
             * Check if we should perform a checkpoint or a restartpoint.  As
             * a side-effect, RecoveryInProgress() initializes TimeLineID if
             * it's not set yet.
             */
            do_restartpoint = RecoveryInProgress();

            /*
             * Atomically fetch the request flags to figure out what kind of
             * a checkpoint we should perform, and increase the
             * started-counter to acknowledge that we've started a new
             * checkpoint.
             */
            SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
            flags |= CheckpointerShmem->ckpt_flags;
            CheckpointerShmem->ckpt_flags = 0;
            CheckpointerShmem->ckpt_started++;
            SpinLockRelease(&CheckpointerShmem->ckpt_lck);

            /*
             * The end-of-recovery checkpoint is a real checkpoint that's
             * performed while we're still in recovery.
             */
            if (flags & CHECKPOINT_END_OF_RECOVERY)
                do_restartpoint = false;

            /*
             * We will warn if (a) too soon since last checkpoint (whatever
             * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
             * since the last checkpoint start.  Note in particular that this
             * implementation will not generate warnings caused by
             * CheckPointTimeout < CheckPointWarning.
             */
            if (!do_restartpoint &&
                (flags & CHECKPOINT_CAUSE_XLOG) &&
                elapsed_secs < CheckPointWarning)
                ereport(LOG,
                        (errmsg_plural("checkpoints are occurring too frequently (%d second apart)",
                                       "checkpoints are occurring too frequently (%d seconds apart)",
                                       elapsed_secs,
                                       elapsed_secs),
                         errhint("Consider increasing the configuration parameter \"max_wal_size\".")));

            /*
             * Initialize checkpointer-private variables used during
             * checkpoint.
             */
            ckpt_active = true;
            if (do_restartpoint)
                ckpt_start_recptr = GetXLogReplayRecPtr(NULL);
            else
                ckpt_start_recptr = GetInsertRecPtr();
            ckpt_start_time = now;
            ckpt_cached_elapsed = 0;

            /*
             * Do the checkpoint.
             */
            if (!do_restartpoint)
            {
                CreateCheckPoint(flags);
                ckpt_performed = true;
            }
            else
                ckpt_performed = CreateRestartPoint(flags);

            /*
             * After any checkpoint, close all smgr files.  This is so we
             * won't hang onto smgr references to deleted files indefinitely.
             */
            smgrcloseall();

            /*
             * Indicate checkpoint completion to any waiting backends.
             */
            SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
            CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started;
            SpinLockRelease(&CheckpointerShmem->ckpt_lck);

            if (ckpt_performed)
            {
                /*
                 * Note we record the checkpoint start time not end time as
                 * last_checkpoint_time.  This is so that time-driven
                 * checkpoints happen at a predictable spacing.
                 */
                last_checkpoint_time = now;
            }
            else
            {
                /*
                 * We were not able to perform the restartpoint (checkpoints
                 * throw an ERROR in case of error).  Most likely because we
                 * have not received any new checkpoint WAL records since the
                 * last restartpoint.  Try again in 15 s.
                 */
                last_checkpoint_time = now - CheckPointTimeout + 15;
            }

            ckpt_active = false;
        }

        /* Check for archive_timeout and switch xlog files if necessary. */
        CheckArchiveTimeout();

        /*
         * Send off activity statistics to the stats collector.  (The reason
         * why we re-use bgwriter-related code for this is that the bgwriter
         * and checkpointer used to be just one process.  It's probably not
         * worth the trouble to split the stats support into two independent
         * stats message types.)
         */
        pgstat_send_bgwriter();

        /*
         * Sleep until we are signaled or it's time for another checkpoint or
         * xlog file switch.
         */
        now = (pg_time_t) time(NULL);
        elapsed_secs = now - last_checkpoint_time;
        if (elapsed_secs >= CheckPointTimeout)
            continue;           /* no sleep for us ... */
        cur_timeout = CheckPointTimeout - elapsed_secs;
        if (XLogArchiveTimeout > 0 && !RecoveryInProgress())
        {
            elapsed_secs = now - last_xlog_switch_time;
            if (elapsed_secs >= XLogArchiveTimeout)
                continue;       /* no sleep for us ... */
            cur_timeout = Min(cur_timeout, XLogArchiveTimeout - elapsed_secs);
        }

        rc = WaitLatch(MyLatch,
                       WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
                       cur_timeout * 1000L /* convert to ms */ ,
                       WAIT_EVENT_CHECKPOINTER_MAIN);

        /*
         * Emergency bailout if postmaster has died.  This is to avoid the
         * necessity for manual cleanup of all postmaster children.
         */
        if (rc & WL_POSTMASTER_DEATH)
            exit(1);
    }
}
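/*
 * Illustrative sketch only (not from the original source): the sleep-length
 * computation at the bottom of CheckpointerMain's loop, extracted into a
 * standalone program with made-up input values so the arithmetic can be
 * checked in isolation.  CheckPointTimeout and XLogArchiveTimeout are GUCs
 * in the real code; here they are plain constants.
 */
#include <stdio.h>

#define Min(x, y) ((x) < (y) ? (x) : (y))

static const int CheckPointTimeout = 300;   /* seconds, assumed value */
static const int XLogArchiveTimeout = 60;   /* seconds, assumed value */

int
main(void)
{
    int     elapsed_since_checkpoint = 120; /* pretend 2 minutes elapsed */
    int     elapsed_since_switch = 45;      /* pretend 45 s since xlog switch */
    int     cur_timeout;

    /* Sleep until the next timed checkpoint would be due ... */
    cur_timeout = CheckPointTimeout - elapsed_since_checkpoint;

    /* ... but wake earlier if an xlog switch will be due sooner. */
    if (XLogArchiveTimeout > 0)
        cur_timeout = Min(cur_timeout,
                          XLogArchiveTimeout - elapsed_since_switch);

    /* WaitLatch takes milliseconds: 15 s -> 15000 ms with these inputs */
    printf("WaitLatch timeout: %d ms\n", cur_timeout * 1000);
    return 0;
}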
void
CdbCheckDispatchResult_internal(struct CdbDispatcherState *ds,
                                struct SegmentDatabaseDescriptor ***failedSegDB,
                                int *numOfFailed,
                                DispatchWaitMode waitMode)
{
    int         i;
    int         j;
    int         nFailed = 0;
    DispatchCommandParms *pParms;
    CdbDispatchResult *dispatchResult;
    SegmentDatabaseDescriptor *segdbDesc;

    Assert(ds != NULL);

    if (failedSegDB)
        *failedSegDB = NULL;
    if (numOfFailed)
        *numOfFailed = 0;

    /*
     * No-op if no work was dispatched since the last time we were called.
     */
    if (!ds->dispatchThreads || ds->dispatchThreads->threadCount == 0)
    {
        elog(DEBUG5, "CheckDispatchResult: no threads active");
        return;
    }

    /*
     * Wait for threads to finish.
     */
    for (i = 0; i < ds->dispatchThreads->threadCount; i++)
    {
        pParms = &ds->dispatchThreads->dispatchCommandParmsAr[i];
        Assert(pParms != NULL);

        /*
         * Does caller want to stop short?
         */
        switch (waitMode)
        {
            case DISPATCH_WAIT_CANCEL:
            case DISPATCH_WAIT_FINISH:
                pParms->waitMode = waitMode;
                break;
            default:
                break;
        }

        if (gp_connections_per_thread == 0)
        {
            thread_DispatchWait(pParms);
        }
        else
        {
            elog(DEBUG4, "CheckDispatchResult: Joining to thread %d of %d",
                 i + 1, ds->dispatchThreads->threadCount);

            if (pParms->thread_valid)
            {
                int         pthread_err = 0;

                pthread_err = pthread_join(pParms->thread, NULL);
                if (pthread_err != 0)
                    elog(FATAL, "CheckDispatchResult: pthread_join failed on thread %d (%lu) of %d (returned %d attempting to join to %lu)",
                         i + 1,
#ifndef _WIN32
                         (unsigned long) pParms->thread,
#else
                         (unsigned long) pParms->thread.p,
#endif
                         ds->dispatchThreads->threadCount,
                         pthread_err,
                         (unsigned long) mythread());
            }
        }

        HOLD_INTERRUPTS();
        pParms->thread_valid = false;
        MemSet(&pParms->thread, 0, sizeof(pParms->thread));
        RESUME_INTERRUPTS();

        /*
         * Examine the CdbDispatchResult objects containing the results from
         * this thread's QEs.
         */
        for (j = 0; j < pParms->db_count; j++)
        {
            dispatchResult = pParms->dispatchResultPtrArray[j];

            if (dispatchResult == NULL)
            {
                elog(LOG, "CheckDispatchResult: result object is NULL; skipping.");
                continue;
            }

            if (dispatchResult->segdbDesc == NULL)
            {
                elog(LOG, "CheckDispatchResult: result object segment descriptor is NULL; skipping.");
                continue;
            }

            segdbDesc = dispatchResult->segdbDesc;

            /*
             * segdbDesc error message is unlikely here, but check anyway.
             */
            if (segdbDesc->errcode || segdbDesc->error_message.len)
                cdbdisp_mergeConnectionErrors(dispatchResult, segdbDesc);

            /*
             * Log the result
             */
            if (DEBUG2 >= log_min_messages)
                cdbdisp_debugDispatchResult(dispatchResult, DEBUG2, DEBUG3);

            /*
             * Notify FTS to reconnect if connection lost or never connected.
             */
            if (failedSegDB && PQstatus(segdbDesc->conn) == CONNECTION_BAD)
            {
                /*
                 * Allocate storage.  Caller should pfree() it.
                 */
                if (!*failedSegDB)
                    *failedSegDB = palloc(sizeof(**failedSegDB) *
                                          (2 * getgpsegmentCount() + 1));

                /*
                 * Append to broken connection list.
                 */
                (*failedSegDB)[nFailed++] = segdbDesc;
                (*failedSegDB)[nFailed] = NULL;

                if (numOfFailed)
                    *numOfFailed = nFailed;
            }

            /*
             * Zap our SegmentDatabaseDescriptor ptr because it may be
             * invalidated by the call to FtsHandleNetFailure() below.
             * Anything we need from there, we should get before this.
             */
            dispatchResult->segdbDesc = NULL;
        }
    }

    /*
     * Reset thread state (will be destroyed later on in finishCommand).
     */
    ds->dispatchThreads->threadCount = 0;

    /*
     * Everything appears to have gone fine; still, make sure we don't miss
     * a user cancellation.
     *
     * The waitMode argument is NONE when we are doing "normal work".
     */
    if (waitMode == DISPATCH_WAIT_NONE || waitMode == DISPATCH_WAIT_FINISH)
        CHECK_FOR_INTERRUPTS();
}
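/*
 * Illustrative sketch only (not from the original source): the
 * join-then-clear pattern used above, reduced to a runnable pthreads
 * program.  The real code brackets the handle-clearing with
 * HOLD_INTERRUPTS/RESUME_INTERRUPTS so a cancel arriving mid-update cannot
 * observe a half-cleared thread handle; a plain memset stands in for that
 * critical section here.
 */
#include <pthread.h>
#include <stdio.h>
#include <string.h>

#define NTHREADS 3

static void *
worker(void *arg)
{
    (void) arg;                 /* a real dispatch thread would work here */
    return NULL;
}

int
main(void)
{
    pthread_t   threads[NTHREADS];
    int         i;

    for (i = 0; i < NTHREADS; i++)
        pthread_create(&threads[i], NULL, worker, NULL);

    for (i = 0; i < NTHREADS; i++)
    {
        int     pthread_err = pthread_join(threads[i], NULL);

        if (pthread_err != 0)
            fprintf(stderr, "pthread_join failed: %d\n", pthread_err);

        /* clear the handle only after the join has succeeded */
        memset(&threads[i], 0, sizeof(threads[i]));
    }

    printf("all dispatch threads joined\n");
    return 0;
}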
/*
 * master_create_worker_shards creates empty shards for the given table based
 * on the specified number of initial shards.  The function first gets a list
 * of candidate nodes and issues DDL commands on the nodes to create empty
 * shard placements on those nodes.  The function then updates metadata on
 * the master node to make this shard (and its placements) visible.  Note
 * that the function assumes the table is hash partitioned and calculates the
 * min/max hash token ranges for each shard, giving them an equal split of
 * the hash space.
 */
Datum
master_create_worker_shards(PG_FUNCTION_ARGS)
{
    text       *tableNameText = PG_GETARG_TEXT_P(0);
    int32       shardCount = PG_GETARG_INT32(1);
    int32       replicationFactor = PG_GETARG_INT32(2);

    Oid         distributedTableId = ResolveRelationId(tableNameText);
    char        relationKind = get_rel_relkind(distributedTableId);
    char       *tableName = text_to_cstring(tableNameText);
    char        shardStorageType = '\0';
    int32       shardIndex = 0;
    List       *workerNodeList = NIL;
    List       *ddlCommandList = NIL;
    int32       workerNodeCount = 0;
    uint32      placementAttemptCount = 0;
    uint32      hashTokenIncrement = 0;
    List       *existingShardList = NIL;

    /* make sure table is hash partitioned */
    CheckHashPartitionedTable(distributedTableId);

    /* validate that shards haven't already been created for this table */
    existingShardList = LoadShardIntervalList(distributedTableId);
    if (existingShardList != NIL)
    {
        ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                        errmsg("table \"%s\" has already had shards created for it",
                               tableName)));
    }

    /* make sure that at least one shard is specified */
    if (shardCount <= 0)
    {
        ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                        errmsg("shardCount must be positive")));
    }

    /* make sure that at least one replica is specified */
    if (replicationFactor <= 0)
    {
        ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                        errmsg("replicationFactor must be positive")));
    }

    /* calculate the split of the hash space */
    hashTokenIncrement = UINT_MAX / shardCount;

    /* load and sort the worker node list for deterministic placement */
    workerNodeList = ParseWorkerNodeFile(WORKER_LIST_FILENAME);
    workerNodeList = SortList(workerNodeList, CompareWorkerNodes);

    /* make sure we don't process cancel signals until all shards are created */
    HOLD_INTERRUPTS();

    /* retrieve the DDL commands for the table */
    ddlCommandList = TableDDLCommandList(distributedTableId);

    workerNodeCount = list_length(workerNodeList);
    if (replicationFactor > workerNodeCount)
    {
        ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                        errmsg("replicationFactor (%d) exceeds number of worker nodes "
                               "(%d)", replicationFactor, workerNodeCount),
                        errhint("Add more worker nodes or try again with a lower "
                                "replication factor.")));
    }

    /* if we have enough nodes, add an extra placement attempt for backup */
    placementAttemptCount = (uint32) replicationFactor;
    if (workerNodeCount > replicationFactor)
    {
        placementAttemptCount++;
    }

    /* set shard storage type according to relation type */
    if (relationKind == RELKIND_FOREIGN_TABLE)
    {
        shardStorageType = SHARD_STORAGE_FOREIGN;
    }
    else
    {
        shardStorageType = SHARD_STORAGE_TABLE;
    }

    for (shardIndex = 0; shardIndex < shardCount; shardIndex++)
    {
        uint64      shardId = NextSequenceId(SHARD_ID_SEQUENCE_NAME);
        int32       placementCount = 0;
        uint32      placementIndex = 0;
        uint32      roundRobinNodeIndex = shardIndex % workerNodeCount;

        List       *extendedDDLCommands = ExtendedDDLCommandList(distributedTableId,
                                                                 shardId,
                                                                 ddlCommandList);

        /* initialize the hash token space for this shard */
        text       *minHashTokenText = NULL;
        text       *maxHashTokenText = NULL;
        int32       shardMinHashToken = INT_MIN + (shardIndex * hashTokenIncrement);
        int32       shardMaxHashToken = shardMinHashToken + hashTokenIncrement - 1;

        /* if we are at the last shard, make sure the max token value is INT_MAX */
        if (shardIndex == (shardCount - 1))
        {
            shardMaxHashToken = INT_MAX;
        }

        for (placementIndex = 0; placementIndex < placementAttemptCount; placementIndex++)
        {
            int32       candidateNodeIndex =
                (roundRobinNodeIndex + placementIndex) % workerNodeCount;
            WorkerNode *candidateNode = (WorkerNode *) list_nth(workerNodeList,
                                                                candidateNodeIndex);
            char       *nodeName = candidateNode->nodeName;
            uint32      nodePort = candidateNode->nodePort;

            bool        created = ExecuteRemoteCommandList(nodeName, nodePort,
                                                           extendedDDLCommands);

            if (created)
            {
                uint64      shardPlacementId = 0;
                ShardState  shardState = STATE_FINALIZED;

                shardPlacementId = NextSequenceId(SHARD_PLACEMENT_ID_SEQUENCE_NAME);
                InsertShardPlacementRow(shardPlacementId, shardId, shardState,
                                        nodeName, nodePort);
                placementCount++;
            }
            else
            {
                ereport(WARNING, (errmsg("could not create shard on \"%s:%u\"",
                                         nodeName, nodePort)));
            }

            if (placementCount >= replicationFactor)
            {
                break;
            }
        }

        /* check if we created enough shard replicas */
        if (placementCount < replicationFactor)
        {
            ereport(ERROR, (errmsg("could not satisfy specified replication factor"),
                            errdetail("Created %d shard replicas, less than the "
                                      "requested replication factor of %d.",
                                      placementCount, replicationFactor)));
        }

        /* insert the shard metadata row along with its min/max values */
        minHashTokenText = IntegerToText(shardMinHashToken);
        maxHashTokenText = IntegerToText(shardMaxHashToken);
        InsertShardRow(distributedTableId, shardId, shardStorageType,
                       minHashTokenText, maxHashTokenText);
    }

    if (QueryCancelPending)
    {
        ereport(WARNING, (errmsg("cancel requests are ignored during shard creation")));
        QueryCancelPending = false;
    }

    RESUME_INTERRUPTS();

    PG_RETURN_VOID();
}
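/*
 * Illustrative sketch only (not from the original source): the hash-token
 * range arithmetic from master_create_worker_shards, runnable in isolation.
 * The signed wraparound of INT_MIN + index * increment relies on
 * two's-complement conversion exactly as the function above does, and the
 * last shard absorbs the integer-division remainder by being clamped to
 * INT_MAX.
 */
#include <limits.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    int32_t     shardCount = 4;     /* assumed value */
    uint32_t    hashTokenIncrement = UINT_MAX / shardCount;
    int32_t     shardIndex;

    for (shardIndex = 0; shardIndex < shardCount; shardIndex++)
    {
        int32_t shardMinHashToken = INT_MIN + (shardIndex * hashTokenIncrement);
        int32_t shardMaxHashToken = shardMinHashToken + hashTokenIncrement - 1;

        /* the last shard's range must end exactly at INT_MAX */
        if (shardIndex == (shardCount - 1))
            shardMaxHashToken = INT_MAX;

        printf("shard %d: [%d, %d]\n", shardIndex,
               shardMinHashToken, shardMaxHashToken);
    }
    return 0;
}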
/*
 * AutoVacMain
 */
NON_EXEC_STATIC void
AutoVacMain(int argc, char *argv[])
{
    ListCell   *cell;
    List       *dblist;
    autovac_dbase *db;
    TransactionId xidForceLimit;
    bool        for_xid_wrap;
    sigjmp_buf  local_sigjmp_buf;

    /* we are a postmaster subprocess now */
    IsUnderPostmaster = true;
    am_autovacuum = true;

    /* MPP-4990: Autovacuum always runs as utility-mode */
    Gp_role = GP_ROLE_UTILITY;

    /* reset MyProcPid */
    MyProcPid = getpid();

    /* record Start Time for logging */
    MyStartTime = time(NULL);

    /* Identify myself via ps */
    init_ps_display("autovacuum process", "", "", "");

    SetProcessingMode(InitProcessing);

    /*
     * If possible, make this process a group leader, so that the postmaster
     * can signal any child processes too.  (autovacuum probably never has
     * any child processes, but for consistency we make all postmaster child
     * processes do this.)
     */
#ifdef HAVE_SETSID
    if (setsid() < 0)
        elog(FATAL, "setsid() failed: %m");
#endif

    /*
     * Set up signal handlers.  We operate on databases much like a regular
     * backend, so we use the same signal handling.  See equivalent code in
     * tcop/postgres.c.
     *
     * Currently, we don't pay attention to postgresql.conf changes that
     * happen during a single daemon iteration, so we can ignore SIGHUP.
     */
    pqsignal(SIGHUP, SIG_IGN);

    /*
     * SIGINT is used to signal cancelling the current table's vacuum;
     * SIGTERM means abort and exit cleanly, and SIGQUIT means abandon ship.
     */
    pqsignal(SIGINT, StatementCancelHandler);
    pqsignal(SIGTERM, die);
    pqsignal(SIGQUIT, quickdie);
    pqsignal(SIGALRM, handle_sig_alarm);

    pqsignal(SIGPIPE, SIG_IGN);
    pqsignal(SIGUSR1, procsignal_sigusr1_handler);
    /* We don't listen for async notifies */
    pqsignal(SIGUSR2, SIG_IGN);
    pqsignal(SIGFPE, FloatExceptionHandler);
    pqsignal(SIGCHLD, SIG_DFL);

    /* Early initialization */
    BaseInit();

    /*
     * Create a per-backend PGPROC struct in shared memory, except in the
     * EXEC_BACKEND case where this was done in SubPostmasterMain.  We must
     * do this before we can use LWLocks (and in the EXEC_BACKEND case we
     * already had to do some stuff with LWLocks).
     */
#ifndef EXEC_BACKEND
    InitProcess();
#endif

    /*
     * If an exception is encountered, processing resumes here.
     *
     * See notes in postgres.c about the design of this coding.
     */
    if (sigsetjmp(local_sigjmp_buf, 1) != 0)
    {
        /* Prevent interrupts while cleaning up */
        HOLD_INTERRUPTS();

        /* Report the error to the server log */
        EmitErrorReport();

        /*
         * We can now go away.  Note that because we called InitProcess, a
         * callback was registered to do ProcKill, which will clean up
         * necessary state.
         */
        proc_exit(0);
    }

    /* We can now handle ereport(ERROR) */
    PG_exception_stack = &local_sigjmp_buf;

    PG_SETMASK(&UnBlockSig);

    /*
     * Force zero_damaged_pages OFF in the autovac process, even if it is set
     * in postgresql.conf.  We don't really want such a dangerous option
     * being applied non-interactively.
     */
    SetConfigOption("zero_damaged_pages", "false", PGC_SUSET, PGC_S_OVERRIDE);

    /* Get a list of databases */
    dblist = autovac_get_database_list();

    /*
     * Determine the oldest datfrozenxid/relfrozenxid that we will allow to
     * pass without forcing a vacuum.  (This limit can be tightened for
     * particular tables, but not loosened.)
     */
    recentXid = ReadNewTransactionId();
    xidForceLimit = recentXid - autovacuum_freeze_max_age;
    /* ensure it's a "normal" XID, else TransactionIdPrecedes misbehaves */
    if (xidForceLimit < FirstNormalTransactionId)
        xidForceLimit -= FirstNormalTransactionId;

    /*
     * Choose a database to connect to.  We pick the database that was least
     * recently auto-vacuumed, or one that needs vacuuming to prevent Xid
     * wraparound-related data loss.  If any db at risk of wraparound is
     * found, we pick the one with oldest datfrozenxid, independently of
     * autovacuum times.
     *
     * Note that a database with no stats entry is not considered, except for
     * Xid wraparound purposes.  The theory is that if no one has ever
     * connected to it since the stats were last initialized, it doesn't need
     * vacuuming.
     *
     * XXX This could be improved if we had more info about whether it needs
     * vacuuming before connecting to it.  Perhaps look through the pgstats
     * data for the database's tables?  One idea is to keep track of the
     * number of new and dead tuples per database in pgstats.  However it
     * isn't clear how to construct a metric that measures that and not cause
     * starvation for less busy databases.
     */
    db = NULL;
    for_xid_wrap = false;
    foreach(cell, dblist)
    {
        autovac_dbase *tmp = lfirst(cell);

        /* Find pgstat entry if any */
        tmp->entry = pgstat_fetch_stat_dbentry(tmp->oid);

        /* Check to see if this one is at risk of wraparound */
        if (TransactionIdPrecedes(tmp->frozenxid, xidForceLimit))
        {
            if (db == NULL ||
                TransactionIdPrecedes(tmp->frozenxid, db->frozenxid))
                db = tmp;
            for_xid_wrap = true;
            continue;
        }
        else if (for_xid_wrap)
            continue;           /* ignore not-at-risk DBs */

        /*
         * Otherwise, skip a database with no pgstat entry; it means it
         * hasn't seen any activity.
         */
        if (!tmp->entry)
            continue;

        /*
         * Remember the db with oldest autovac time.  (If we are here, both
         * tmp->entry and db->entry must be non-null.)
         */
        if (db == NULL ||
            tmp->entry->last_autovac_time < db->entry->last_autovac_time)
            db = tmp;
    }
/*
 * Workfile-manager specific function to clean up before releasing a
 * workfile set from the cache.
 */
static void
workfile_mgr_cleanup_set(const void *resource)
{
    workfile_set *work_set = (workfile_set *) resource;

    /*
     * This callback function has to return cleanly ALL the time; it must not
     * throw an exception.  We must try to clean up as much as we can in the
     * callback, since it will never be called again for this set.  This
     * means holding interrupts, and catching and handling all exceptions.
     */
    if (work_set->on_disk)
    {
        ereport(gp_workfile_caching_loglevel,
                (errmsg("workfile mgr cleanup deleting set: key=0x%x, size=" INT64_FORMAT
                        " in_progress_size=" INT64_FORMAT " path=%s",
                        work_set->key,
                        work_set->size,
                        work_set->in_progress_size,
                        work_set->path),
                 errprintstack(true)));

        Assert(NULL == work_set->set_plan);

        PG_TRY();
        {
#ifdef FAULT_INJECTOR
            FaultInjector_InjectFaultIfSet(
                                           WorkfileCleanupSet,
                                           DDLNotSpecified,
                                           "",  /* databaseName */
                                           ""); /* tableName */
#endif

            /* Prevent interrupts while cleaning up */
            HOLD_INTERRUPTS();

            workfile_mgr_delete_set_directory(work_set->path);

            /* Now we can allow interrupts again */
            RESUME_INTERRUPTS();
        }
        PG_CATCH();
        {
            elog(LOG, "Cleaning up workfile set directory path=%s failed. Proceeding",
                 work_set->path);

            /*
             * We're not re-throwing the error.  Otherwise we'll end up
             * having to clean up again, probably failing again.
             */
        }
        PG_END_TRY();

        /*
         * The most accurate size of a workset is recorded in
         * work_set->in_progress_size.  work_set->size is only updated when
         * we close a file, so it lags behind.
         */
        Assert(work_set->in_progress_size >= work_set->size);
        int64       size_to_delete = work_set->in_progress_size;

        elog(gp_workfile_caching_loglevel, "Subtracting " INT64_FORMAT
             " from workfile diskspace", size_to_delete);

        /*
         * When subtracting the size of this workset from our accounting,
         * only update the per-query counter if we created the workset.  In
         * that case, the state is ACQUIRED; otherwise it is CACHED or
         * DELETED.
         */
        CacheEntry *cacheEntry = CACHE_ENTRY_HEADER(resource);
        bool        update_query_space = (cacheEntry->state == CACHE_ENTRY_ACQUIRED);

        WorkfileDiskspace_Commit(0, size_to_delete, update_query_space);
    }
    else
    {
        /*
         * Non-physical workfile set, we need to free up the plan memory.
         * (Check set_plan for NULL before dereferencing it.)
         */
        if (NULL != work_set->set_plan)
        {
            if (NULL != work_set->set_plan->serialized_plan)
            {
                pfree(work_set->set_plan->serialized_plan);
            }
            pfree(work_set->set_plan);
        }
    }
}
/*
 * LWLockConditionalAcquire - acquire a lightweight lock in the specified mode
 *
 * If the lock is not available, return FALSE with no side-effects.
 *
 * If successful, cancel/die interrupts are held off until lock release.
 */
bool
LWLockConditionalAcquire(LWLockId lockid, LWLockMode mode)
{
    volatile LWLock *lock = &(LWLockArray[lockid].lock);
#if LWLOCK_LOCK_PARTS > 1
    volatile LWLockPart *part = LWLOCK_PART(lock, lockid, MyBackendId);
#endif
    bool        mustwait;

    PRINT_LWDEBUG("LWLockConditionalAcquire", lockid, lock);

    /* Ensure we will have room to remember the lock */
    if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS)
        elog(ERROR, "too many LWLocks taken");

    /*
     * Lock out cancel/die interrupts until we exit the code section
     * protected by the LWLock.  This ensures that interrupts will not
     * interfere with manipulations of data structures in shared memory.
     */
    HOLD_INTERRUPTS();

    if (mode == LW_SHARED)
    {
#ifdef LWLOCK_PART_SHARED_OPS_ATOMIC
        /*
         * Increment shared counter partition.  If there's no contention,
         * this is sufficient to take the lock.
         */
        LWLOCK_PART_SHARED_POSTINC_ATOMIC(lock, lockid, part, MyBackendId);
        LWLOCK_PART_SHARED_FENCE();

        /*
         * A concurrent exclusive locking attempt does the following three
         * steps:
         *   1) Acquire mutex
         *   2) Check shared counter partitions for readers.
         *   3a) If found, add proc to wait queue, block, restart at (1)
         *   3b) If not found, set exclusive flag, continue with (4)
         *   4) Enter protected section
         * The fence after the atomic add above ensures that no further such
         * attempt can proceed to (3b) or beyond.  There may be pre-existing
         * exclusive locking attempts at step (3b) or beyond, but we can
         * recognize those by either the mutex being taken, or the exclusive
         * flag being set.  Conversely, if we see neither, we may proceed and
         * enter the protected section.
         *
         * FIXME: This doesn't work if slock_t is a struct or doesn't use 0
         * for state "unlocked".
         */
        if ((lock->mutex == 0) && (lock->exclusive == 0))
            goto lock_acquired;

        /*
         * At this point, we don't know if the concurrent exclusive locker
         * has proceeded to (3b) or blocked.  We must take the mutex and
         * re-check.
         */
#endif   /* LWLOCK_PART_SHARED_OPS_ATOMIC */

        /* Acquire mutex.  Time spent holding mutex should be short! */
        SpinLockAcquire(&lock->mutex);

        if (lock->exclusive == 0)
        {
#ifdef LWLOCK_PART_SHARED_OPS_ATOMIC
            /* Already incremented the shared counter partition above */
#else
            lock->shared++;
#endif
            mustwait = false;
        }
        else
        {
#ifdef LWLOCK_PART_SHARED_OPS_ATOMIC
            /*
             * Must undo the shared counter partition increment.  Note that
             * we *need* to do that while holding the mutex.  Otherwise, the
             * exclusive lock could be released and attempted to be
             * re-acquired before we undo the increment.  That attempt would
             * then block, even though there'd be no lock holder left.
             */
            LWLOCK_PART_SHARED_POSTDEC_ATOMIC(lock, lockid, part, MyBackendId);
#endif
            mustwait = true;
        }
    }
    else
    {
        /*
         * Step (1).  Acquire mutex.  Time spent holding mutex should be
         * short!
         */
        SpinLockAcquire(&lock->mutex);

        if (lock->exclusive == 0)
        {
            /*
             * Step (2).  Check for shared lockers.  This surely happens
             * after (1), otherwise SpinLockAcquire() is broken.  Lock
             * acquire semantics demand that no load must be re-ordered from
             * after a lock acquisition to before, for obvious reasons.
             */
            LWLOCK_IS_SHARED(mustwait, lock, lockid);

            if (!mustwait)
            {
                /*
                 * Step (3a).  Set exclusive flag.  This surely happens after
                 * (2) because it depends on the result of (2), no matter how
                 * much reordering is going on here.
                 */
                lock->exclusive++;
            }
        }
        else
            mustwait = true;
    }

    /* We are done updating shared state of the lock itself. */
    SpinLockRelease(&lock->mutex);

    if (mustwait)
    {
        /* Failed to get lock, so release interrupt holdoff */
        RESUME_INTERRUPTS();
        LOG_LWDEBUG("LWLockConditionalAcquire", lockid, "failed");
        TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL(lockid, mode);
        return false;
    }

#ifdef LWLOCK_PART_SHARED_OPS_ATOMIC
lock_acquired:
#endif

    TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE(lockid, mode);

    /* Add lock to list of locks held by this backend */
    held_lwlocks[num_held_lwlocks] = lockid;
    held_lwlocks_mode[num_held_lwlocks] = mode;
    ++num_held_lwlocks;

    return true;
}
/*
 * Main entry point for bgwriter process
 *
 * This is invoked from BootstrapMain, which has already created the basic
 * execution environment, but not enabled signals yet.
 */
void
BackgroundWriterMain(void)
{
    sigjmp_buf  local_sigjmp_buf;
    MemoryContext bgwriter_context;

    BgWriterShmem->bgwriter_pid = MyProcPid;
    am_bg_writer = true;

    /*
     * If possible, make this process a group leader, so that the postmaster
     * can signal any child processes too.  (bgwriter probably never has any
     * child processes, but for consistency we make all postmaster child
     * processes do this.)
     */
#ifdef HAVE_SETSID
    if (setsid() < 0)
        elog(FATAL, "setsid() failed: %m");
#endif

    /*
     * Properly accept or ignore signals the postmaster might send us
     *
     * Note: we deliberately ignore SIGTERM, because during a standard Unix
     * system shutdown cycle, init will SIGTERM all processes at once.  We
     * want to wait for the backends to exit, whereupon the postmaster will
     * tell us it's okay to shut down (via SIGUSR2).
     *
     * SIGUSR1 is presently unused; keep it spare in case someday we want
     * this process to participate in sinval messaging.
     */
    pqsignal(SIGHUP, BgSigHupHandler);      /* set flag to read config file */
    pqsignal(SIGINT, ReqCheckpointHandler); /* request checkpoint */
    pqsignal(SIGTERM, SIG_IGN);             /* ignore SIGTERM */
    pqsignal(SIGQUIT, bg_quickdie);         /* hard crash time */
    pqsignal(SIGALRM, SIG_IGN);
    pqsignal(SIGPIPE, SIG_IGN);
    pqsignal(SIGUSR1, SIG_IGN);             /* reserve for sinval */
    pqsignal(SIGUSR2, ReqShutdownHandler);  /* request shutdown */

    /*
     * Reset some signals that are accepted by postmaster but not here
     */
    pqsignal(SIGCHLD, SIG_DFL);
    pqsignal(SIGTTIN, SIG_DFL);
    pqsignal(SIGTTOU, SIG_DFL);
    pqsignal(SIGCONT, SIG_DFL);
    pqsignal(SIGWINCH, SIG_DFL);

    /* We allow SIGQUIT (quickdie) at all times */
#ifdef HAVE_SIGPROCMASK
    sigdelset(&BlockSig, SIGQUIT);
#else
    BlockSig &= ~(sigmask(SIGQUIT));
#endif

    /*
     * Initialize so that first time-driven event happens at the correct
     * time.
     */
    last_checkpoint_time = last_xlog_switch_time = time(NULL);

    /*
     * Create a resource owner to keep track of our resources (currently only
     * buffer pins).
     */
    CurrentResourceOwner = ResourceOwnerCreate(NULL, "Background Writer");

    /*
     * Create a memory context that we will do all our work in.  We do this
     * so that we can reset the context during error recovery and thereby
     * avoid possible memory leaks.  Formerly this code just ran in
     * TopMemoryContext, but resetting that would be a really bad idea.
     */
    bgwriter_context = AllocSetContextCreate(TopMemoryContext,
                                             "Background Writer",
                                             ALLOCSET_DEFAULT_MINSIZE,
                                             ALLOCSET_DEFAULT_INITSIZE,
                                             ALLOCSET_DEFAULT_MAXSIZE);
    MemoryContextSwitchTo(bgwriter_context);

    /*
     * If an exception is encountered, processing resumes here.
     *
     * See notes in postgres.c about the design of this coding.
     */
    if (sigsetjmp(local_sigjmp_buf, 1) != 0)
    {
        /* Since not using PG_TRY, must reset error stack by hand */
        error_context_stack = NULL;

        /* Prevent interrupts while cleaning up */
        HOLD_INTERRUPTS();

        /* Report the error to the server log */
        EmitErrorReport();

        /*
         * These operations are really just a minimal subset of
         * AbortTransaction().  We don't have very many resources to worry
         * about in bgwriter, but we do have LWLocks, buffers, and temp
         * files.
         */
        LWLockReleaseAll();
        AbortBufferIO();
        UnlockBuffers();
        /* buffer pins are released here: */
        ResourceOwnerRelease(CurrentResourceOwner,
                             RESOURCE_RELEASE_BEFORE_LOCKS,
                             false, true);
        /* we needn't bother with the other ResourceOwnerRelease phases */
        AtEOXact_Buffers(false);
        AtEOXact_Files();
        AtEOXact_HashTables(false);

        /* Warn any waiting backends that the checkpoint failed. */
        if (ckpt_active)
        {
            /* use volatile pointer to prevent code rearrangement */
            volatile BgWriterShmemStruct *bgs = BgWriterShmem;

            SpinLockAcquire(&bgs->ckpt_lck);
            bgs->ckpt_failed++;
            bgs->ckpt_done = bgs->ckpt_started;
            SpinLockRelease(&bgs->ckpt_lck);

            ckpt_active = false;
        }

        /*
         * Now return to normal top-level context and clear ErrorContext for
         * next time.
         */
        MemoryContextSwitchTo(bgwriter_context);
        FlushErrorState();

        /* Flush any leaked data in the top-level context */
        MemoryContextResetAndDeleteChildren(bgwriter_context);

        /* Now we can allow interrupts again */
        RESUME_INTERRUPTS();

        /*
         * Sleep at least 1 second after any error.  A write error is likely
         * to be repeated, and we don't want to be filling the error logs as
         * fast as we can.
         */
        pg_usleep(1000000L);

        /*
         * Close all open files after any error.  This is helpful on Windows,
         * where holding deleted files open causes various strange errors.
         * It's not clear we need it elsewhere, but shouldn't hurt.
         */
        smgrcloseall();
    }

    /* We can now handle ereport(ERROR) */
    PG_exception_stack = &local_sigjmp_buf;

    /*
     * Unblock signals (they were blocked when the postmaster forked us)
     */
    PG_SETMASK(&UnBlockSig);

    /*
     * Loop forever
     */
    for (;;)
    {
        bool        do_checkpoint = false;
        int         flags = 0;
        time_t      now;
        int         elapsed_secs;

        /*
         * Emergency bailout if postmaster has died.  This is to avoid the
         * necessity for manual cleanup of all postmaster children.
         */
        if (!PostmasterIsAlive(true))
            exit(1);

        /*
         * Process any requests or signals received recently.
         */
        AbsorbFsyncRequests();

        if (got_SIGHUP)
        {
            got_SIGHUP = false;
            ProcessConfigFile(PGC_SIGHUP);
        }
        if (checkpoint_requested)
        {
            checkpoint_requested = false;
            do_checkpoint = true;
            BgWriterStats.m_requested_checkpoints++;
        }
        if (shutdown_requested)
        {
            /*
             * From here on, elog(ERROR) should end with exit(1), not send
             * control back to the sigsetjmp block above
             */
            ExitOnAnyError = true;
            /* Close down the database */
            ShutdownXLOG(0, 0);
            DumpFreeSpaceMap(0, 0);
            /* Normal exit from the bgwriter is here */
            proc_exit(0);       /* done */
        }

        /*
         * Force a checkpoint if too much time has elapsed since the last
         * one.  Note that we count a timed checkpoint in stats only when
         * this occurs without an external request, but we set the CAUSE_TIME
         * flag bit even if there is also an external request.
         */
        now = time(NULL);
        elapsed_secs = now - last_checkpoint_time;
        if (elapsed_secs >= CheckPointTimeout)
        {
            if (!do_checkpoint)
                BgWriterStats.m_timed_checkpoints++;
            do_checkpoint = true;
            flags |= CHECKPOINT_CAUSE_TIME;
        }

        /*
         * Do a checkpoint if requested, otherwise do one cycle of
         * dirty-buffer writing.
         */
        if (do_checkpoint)
        {
            /* use volatile pointer to prevent code rearrangement */
            volatile BgWriterShmemStruct *bgs = BgWriterShmem;

            /*
             * Atomically fetch the request flags to figure out what kind of
             * a checkpoint we should perform, and increase the
             * started-counter to acknowledge that we've started a new
             * checkpoint.
             */
            SpinLockAcquire(&bgs->ckpt_lck);
            flags |= bgs->ckpt_flags;
            bgs->ckpt_flags = 0;
            bgs->ckpt_started++;
            SpinLockRelease(&bgs->ckpt_lck);

            /*
             * We will warn if (a) too soon since last checkpoint (whatever
             * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
             * since the last checkpoint start.  Note in particular that this
             * implementation will not generate warnings caused by
             * CheckPointTimeout < CheckPointWarning.
             */
            if ((flags & CHECKPOINT_CAUSE_XLOG) &&
                elapsed_secs < CheckPointWarning)
                ereport(LOG,
                        (errmsg("checkpoints are occurring too frequently (%d seconds apart)",
                                elapsed_secs),
                         errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));

            /*
             * Initialize bgwriter-private variables used during checkpoint.
             */
            ckpt_active = true;
            ckpt_start_recptr = GetInsertRecPtr();
            ckpt_start_time = now;
            ckpt_cached_elapsed = 0;

            /*
             * Do the checkpoint.
             */
            CreateCheckPoint(flags);

            /*
             * After any checkpoint, close all smgr files.  This is so we
             * won't hang onto smgr references to deleted files indefinitely.
             */
            smgrcloseall();

            /*
             * Indicate checkpoint completion to any waiting backends.
             */
            SpinLockAcquire(&bgs->ckpt_lck);
            bgs->ckpt_done = bgs->ckpt_started;
            SpinLockRelease(&bgs->ckpt_lck);

            ckpt_active = false;

            /*
             * Note we record the checkpoint start time not end time as
             * last_checkpoint_time.  This is so that time-driven checkpoints
             * happen at a predictable spacing.
             */
            last_checkpoint_time = now;
        }
        else
            BgBufferSync();

        /* Check for archive_timeout and switch xlog files if necessary. */
        CheckArchiveTimeout();

        /* Nap for the configured time. */
        BgWriterNap();
    }
}
inline
void *
AbstractionLayer::Allocator::internalAllocate(void *inPtr, const size_t inSize) const
{
    // Avoid warning that inPtr is not used if R == NewAllocation
    (void) inPtr;

    void *ptr;
    bool errorOccurred = false;
    MemoryContext oldContext = NULL;
    MemoryContext aggContext = NULL;

    if (F == dbal::ReturnNULL)
    {
        /*
         * HOLD_INTERRUPTS() and RESUME_INTERRUPTS() only change the value of
         * a global variable but have no other side effects. In particular,
         * they do not call CHECK_FOR_INTERRUPTS(). Hence, we are safe to use
         * these macros outside of a PG_TRY() block.
         */
        HOLD_INTERRUPTS();
    }

    PG_TRY();
    {
        if (MC == dbal::AggregateContext)
        {
            if (!AggCheckCallContext(fcinfo, &aggContext))
                errorOccurred = true;
            else
            {
                oldContext = MemoryContextSwitchTo(aggContext);
                ptr = (R == Reallocation)
                    ? internalRePalloc<ZM>(inPtr, inSize)
                    : internalPalloc<ZM>(inSize);
                MemoryContextSwitchTo(oldContext);
            }
        }
        else
        {
            ptr = R
                ? internalRePalloc<ZM>(inPtr, inSize)
                : internalPalloc<ZM>(inSize);
        }
    }
    PG_CATCH();
    {
        if (F == dbal::ReturnNULL)
        {
            /*
             * This cannot be due to an interrupt, so it's reasonably safe to
             * assume that the PG exception was a pure memory-allocation
             * issue. We ignore the error and flush the error state.
             * Flushing is necessary for leaving the error state (e.g., the
             * memory context is restored).
             */
            FlushErrorState();
            ptr = NULL;
        }
        else
        {
            /*
             * PostgreSQL error messages can be stacked. So, it doesn't hurt
             * to add our own message. After unwinding the C++ stack, the
             * PostgreSQL exception will be re-thrown into the PostgreSQL C
             * code.
             *
             * Throwing C++ exceptions inside a PG_CATCH block is not
             * problematic per se, but it is good practice to keep the
             * exception mechanisms clearly separated.
             */
            errorOccurred = true;
        }
    }
    PG_END_TRY();

    if (errorOccurred)
    {
        PG_TRY();
        {
            // Clean up after ourselves
            if (oldContext != NULL)
                MemoryContextSwitchTo(oldContext);
        }
        PG_CATCH();
        {
            if (F == dbal::ReturnNULL)
            {
                // We tried to clean up after ourselves. If this fails, we
                // can only ignore the issue.
                FlushErrorState();
            }
            // Else do nothing. We will add a bad-allocation exception on top
            // of the existing PostgreSQL exception stack.
        }
        PG_END_TRY();
    }

    if (errorOccurred || !ptr)
        // We do not want to interleave PG exceptions and C++ exceptions.
        throw std::bad_alloc();

    if (F == dbal::ReturnNULL)
    {
        RESUME_INTERRUPTS();
    }

    return ptr;
}
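/*
 * Illustrative sketch only (not from the original source): a standalone
 * setjmp/longjmp model of the dbal::ReturnNULL policy above.  Because the
 * modeled "interrupts" are held for the whole guarded section, the only way
 * into the recovery branch is a genuine allocation failure, which is what
 * makes silently returning NULL from it a sound policy.
 */
#include <setjmp.h>
#include <stdio.h>
#include <stdlib.h>

static jmp_buf  catch_buf;
static volatile int holdoff = 0;    /* models InterruptHoldoffCount */

static void *
model_palloc(size_t size)
{
    void   *ptr = malloc(size);

    if (ptr == NULL)
        longjmp(catch_buf, 1);      /* models ereport(ERROR) */
    return ptr;
}

static void *
allocate_return_null(size_t size)
{
    void   *ptr;

    holdoff++;                      /* HOLD_INTERRUPTS(): no cancel can fire */
    if (setjmp(catch_buf) == 0)
        ptr = model_palloc(size);
    else
        ptr = NULL;                 /* must be out-of-memory, not a cancel */
    holdoff--;                      /* RESUME_INTERRUPTS() */
    return ptr;
}

int
main(void)
{
    void   *p = allocate_return_null(64);

    printf("allocation %s\n", p ? "succeeded" : "failed");
    free(p);
    return 0;
}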
/*
 * Main entry point for walwriter process
 *
 * This is invoked from BootstrapMain, which has already created the basic
 * execution environment, but not enabled signals yet.
 */
void
WalWriterMain(void)
{
    sigjmp_buf  local_sigjmp_buf;
    MemoryContext walwriter_context;

    /*
     * If possible, make this process a group leader, so that the postmaster
     * can signal any child processes too.  (walwriter probably never has any
     * child processes, but for consistency we make all postmaster child
     * processes do this.)
     */
#ifdef HAVE_SETSID
    if (setsid() < 0)
        elog(FATAL, "setsid() failed: %m");
#endif

    /*
     * Properly accept or ignore signals the postmaster might send us
     *
     * We have no particular use for SIGINT at the moment, but seems
     * reasonable to treat like SIGTERM.
     */
    pqsignal(SIGHUP, WalSigHupHandler);     /* set flag to read config file */
    pqsignal(SIGINT, WalShutdownHandler);   /* request shutdown */
    pqsignal(SIGTERM, WalShutdownHandler);  /* request shutdown */
    pqsignal(SIGQUIT, wal_quickdie);        /* hard crash time */
    pqsignal(SIGALRM, SIG_IGN);
    pqsignal(SIGPIPE, SIG_IGN);
    pqsignal(SIGUSR1, SIG_IGN);             /* reserve for ProcSignal */
    pqsignal(SIGUSR2, SIG_IGN);             /* not used */

    /*
     * Reset some signals that are accepted by postmaster but not here
     */
    pqsignal(SIGCHLD, SIG_DFL);
    pqsignal(SIGTTIN, SIG_DFL);
    pqsignal(SIGTTOU, SIG_DFL);
    pqsignal(SIGCONT, SIG_DFL);
    pqsignal(SIGWINCH, SIG_DFL);

    /* We allow SIGQUIT (quickdie) at all times */
    sigdelset(&BlockSig, SIGQUIT);

    /*
     * Create a resource owner to keep track of our resources (not clear that
     * we need this, but may as well have one).
     */
    CurrentResourceOwner = ResourceOwnerCreate(NULL, "Wal Writer");

    /*
     * Create a memory context that we will do all our work in.  We do this
     * so that we can reset the context during error recovery and thereby
     * avoid possible memory leaks.  Formerly this code just ran in
     * TopMemoryContext, but resetting that would be a really bad idea.
     */
    walwriter_context = AllocSetContextCreate(TopMemoryContext,
                                              "Wal Writer",
                                              ALLOCSET_DEFAULT_MINSIZE,
                                              ALLOCSET_DEFAULT_INITSIZE,
                                              ALLOCSET_DEFAULT_MAXSIZE);
    MemoryContextSwitchTo(walwriter_context);

    /*
     * If an exception is encountered, processing resumes here.
     *
     * This code is heavily based on bgwriter.c, q.v.
     */
    if (sigsetjmp(local_sigjmp_buf, 1) != 0)
    {
        /* Since not using PG_TRY, must reset error stack by hand */
        error_context_stack = NULL;

        /* Prevent interrupts while cleaning up */
        HOLD_INTERRUPTS();

        /* Report the error to the server log */
        EmitErrorReport();

        /*
         * These operations are really just a minimal subset of
         * AbortTransaction().  We don't have very many resources to worry
         * about in walwriter, but we do have LWLocks, and perhaps buffers?
         */
        LWLockReleaseAll();
        AbortBufferIO();
        UnlockBuffers();
        /* buffer pins are released here: */
        ResourceOwnerRelease(CurrentResourceOwner,
                             RESOURCE_RELEASE_BEFORE_LOCKS,
                             false, true);
        /* we needn't bother with the other ResourceOwnerRelease phases */
        AtEOXact_Buffers(false);
        AtEOXact_Files();
        AtEOXact_HashTables(false);

        /*
         * Now return to normal top-level context and clear ErrorContext for
         * next time.
         */
        MemoryContextSwitchTo(walwriter_context);
        FlushErrorState();

        /* Flush any leaked data in the top-level context */
        MemoryContextResetAndDeleteChildren(walwriter_context);

        /* Now we can allow interrupts again */
        RESUME_INTERRUPTS();

        /*
         * Sleep at least 1 second after any error.  A write error is likely
         * to be repeated, and we don't want to be filling the error logs as
         * fast as we can.
         */
        pg_usleep(1000000L);

        /*
         * Close all open files after any error.  This is helpful on Windows,
         * where holding deleted files open causes various strange errors.
         * It's not clear we need it elsewhere, but shouldn't hurt.
         */
        smgrcloseall();
    }

    /* We can now handle ereport(ERROR) */
    PG_exception_stack = &local_sigjmp_buf;

    /*
     * Unblock signals (they were blocked when the postmaster forked us)
     */
    PG_SETMASK(&UnBlockSig);

    /*
     * Loop forever
     */
    for (;;)
    {
        long        udelay;

        /*
         * Emergency bailout if postmaster has died.  This is to avoid the
         * necessity for manual cleanup of all postmaster children.
         */
        if (!PostmasterIsAlive(true))
            exit(1);

        /*
         * Process any requests or signals received recently.
         */
        if (got_SIGHUP)
        {
            got_SIGHUP = false;
            ProcessConfigFile(PGC_SIGHUP);
        }
        if (shutdown_requested)
        {
            /* Normal exit from the walwriter is here */
            proc_exit(0);       /* done */
        }

        /*
         * Do what we're here for...
         */
        XLogBackgroundFlush();

        /*
         * Delay until time to do something more, but fall out of delay
         * reasonably quickly if signaled.
         */
        udelay = WalWriterDelay * 1000L;
        while (udelay > 999999L)
        {
            if (got_SIGHUP || shutdown_requested)
                break;
            pg_usleep(1000000L);
            udelay -= 1000000L;
        }
        if (!(got_SIGHUP || shutdown_requested))
            pg_usleep(udelay);
    }
}
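/*
 * Illustrative sketch only (not from the original source): the chunked-sleep
 * idiom at the bottom of WalWriterMain, as a standalone program.  Sleeping
 * in one-second slices means a flag set by a signal handler is noticed
 * within about a second, instead of only after the full wal_writer_delay.
 */
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t got_SIGHUP = 0;
static volatile sig_atomic_t shutdown_requested = 0;

static void
responsive_delay(long delay_ms)
{
    long    udelay = delay_ms * 1000L;  /* microseconds */

    while (udelay > 999999L)
    {
        if (got_SIGHUP || shutdown_requested)
            return;                     /* fall out of delay quickly */
        usleep(1000000L);               /* one-second chunk */
        udelay -= 1000000L;
    }
    if (!(got_SIGHUP || shutdown_requested))
        usleep(udelay);                 /* remainder, under one second */
}

int
main(void)
{
    responsive_delay(2500);             /* e.g. a 2.5 s wal_writer_delay */
    printf("delay elapsed (or was cut short)\n");
    return 0;
}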
/* * lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation * * This routine vacuums a single heap, cleans out its indexes, and * updates its relpages and reltuples statistics. * * At entry, we have already established a transaction and opened * and locked the relation. * * The return value indicates whether this function has held off * interrupts -- caller must RESUME_INTERRUPTS() after commit if true. */ bool lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, BufferAccessStrategy bstrategy, List *updated_stats) { LVRelStats *vacrelstats; Relation *Irel; int nindexes; BlockNumber possibly_freeable; PGRUsage ru0; TimestampTz starttime = 0; bool heldoff = false; pg_rusage_init(&ru0); /* measure elapsed time iff autovacuum logging requires it */ if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration > 0) starttime = GetCurrentTimestamp(); if (vacstmt->verbose) elevel = INFO; else elevel = DEBUG2; if (Gp_role == GP_ROLE_DISPATCH) elevel = DEBUG2; /* vacuum and analyze messages aren't interesting from the QD */ #ifdef FAULT_INJECTOR if (vacuumStatement_IsInAppendOnlyDropPhase(vacstmt)) { FaultInjector_InjectFaultIfSet( CompactionBeforeSegmentFileDropPhase, DDLNotSpecified, "", // databaseName ""); // tableName } if (vacummStatement_IsInAppendOnlyCleanupPhase(vacstmt)) { FaultInjector_InjectFaultIfSet( CompactionBeforeCleanupPhase, DDLNotSpecified, "", // databaseName ""); // tableName } #endif /* * MPP-23647. Update xid limits for heap as well as appendonly * relations. This allows setting relfrozenxid to correct value * for an appendonly (AO/CO) table. */ vac_strategy = bstrategy; vacuum_set_xid_limits(vacstmt->freeze_min_age, onerel->rd_rel->relisshared, &OldestXmin, &FreezeLimit); /* * Execute the various vacuum operations. Appendonly tables are treated * differently. */ if (RelationIsAoRows(onerel) || RelationIsAoCols(onerel)) { lazy_vacuum_aorel(onerel, vacstmt, updated_stats); return false; } vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats)); /* heap relation */ /* Set threshold for interesting free space = average request size */ /* XXX should we scale it up or down? Adjust vacuum.c too, if so */ vacrelstats->threshold = GetAvgFSMRequestSize(&onerel->rd_node); vacrelstats->num_index_scans = 0; /* Open all indexes of the relation */ vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel); vacrelstats->hasindex = (nindexes > 0); /* Do the vacuuming */ lazy_scan_heap(onerel, vacrelstats, Irel, nindexes, updated_stats); /* Done with indexes */ vac_close_indexes(nindexes, Irel, NoLock); /* * Optionally truncate the relation. * * Don't even think about it unless we have a shot at releasing a goodly * number of pages. Otherwise, the time taken isn't worth it. * * Note that after we've truncated the heap, it's too late to abort the * transaction; doing so would lose the sinval messages needed to tell * the other backends about the table being shrunk. We prevent interrupts * in that case; caller is responsible for re-enabling them after * committing the transaction. 
*/ possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages; if (possibly_freeable > 0 && (possibly_freeable >= REL_TRUNCATE_MINIMUM || possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION)) { HOLD_INTERRUPTS(); heldoff = true; lazy_truncate_heap(onerel, vacrelstats); } /* Update shared free space map with final free space info */ lazy_update_fsm(onerel, vacrelstats); if (vacrelstats->tot_free_pages > MaxFSMPages) ereport(WARNING, (errmsg("relation \"%s.%s\" contains more than \"max_fsm_pages\" pages with useful free space", get_namespace_name(RelationGetNamespace(onerel)), RelationGetRelationName(onerel)), /* Only suggest VACUUM FULL if > 20% free */ (vacrelstats->tot_free_pages > vacrelstats->rel_pages * 0.20) ? errhint("Consider using VACUUM FULL on this relation or increasing the configuration parameter \"max_fsm_pages\".") : errhint("Consider increasing the configuration parameter \"max_fsm_pages\"."))); /* Update statistics in pg_class */ vac_update_relstats_from_list(onerel, vacrelstats->rel_pages, vacrelstats->rel_tuples, vacrelstats->hasindex, FreezeLimit, updated_stats); /* report results to the stats collector, too */ pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared, true /*vacrelstats->scanned_all*/, vacstmt->analyze, vacrelstats->rel_tuples); if (gp_indexcheck_vacuum == INDEX_CHECK_ALL || (gp_indexcheck_vacuum == INDEX_CHECK_SYSTEM && PG_CATALOG_NAMESPACE == RelationGetNamespace(onerel))) { int i; for (i = 0; i < nindexes; i++) { if (Irel[i]->rd_rel->relam == BTREE_AM_OID) _bt_validate_vacuum(Irel[i], onerel, OldestXmin); } } /* and log the action if appropriate */ if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0) { if (Log_autovacuum_min_duration == 0 || TimestampDifferenceExceeds(starttime, GetCurrentTimestamp(), Log_autovacuum_min_duration)) ereport(LOG, (errmsg("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n" "pages: %d removed, %d remain\n" "tuples: %.0f removed, %.0f remain\n" "system usage: %s", get_database_name(MyDatabaseId), get_namespace_name(RelationGetNamespace(onerel)), RelationGetRelationName(onerel), vacrelstats->num_index_scans, vacrelstats->pages_removed, vacrelstats->rel_pages, vacrelstats->tuples_deleted, vacrelstats->rel_tuples, pg_rusage_show(&ru0)))); } return heldoff; }
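/*
 * Editor's sketch (hypothetical caller, not the actual vacuum() code):
 * because lazy_vacuum_rel() may return with interrupts still held off after
 * truncating the heap, the caller must commit first and only then call
 * RESUME_INTERRUPTS(), as the function's header comment requires.
 */
static void
vacuum_heap_rel_sketch(Relation onerel, VacuumStmt *vacstmt,
					   BufferAccessStrategy bstrategy)
{
	bool		heldoff = lazy_vacuum_rel(onerel, vacstmt, bstrategy, NIL);

	/* commit the transaction our caller established */
	CommitTransactionCommand();

	/* safe to accept cancel/die interrupts again only after commit */
	if (heldoff)
		RESUME_INTERRUPTS();
}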
/* * master_copy_shard_placement implements a user-facing UDF to copy data from * a healthy (source) node to an inactive (target) node. To accomplish this it * entirely recreates the table structure before copying all data. During this * time all modifications to the shard are paused. After successful repair, the * inactive placement is marked healthy and modifications may continue. If the * repair fails at any point, this function throws an error, leaving the node * in an unhealthy state. */ Datum master_copy_shard_placement(PG_FUNCTION_ARGS) { int64 shardId = PG_GETARG_INT64(0); text *sourceNodeName = PG_GETARG_TEXT_P(1); int32 sourceNodePort = PG_GETARG_INT32(2); text *targetNodeName = PG_GETARG_TEXT_P(3); int32 targetNodePort = PG_GETARG_INT32(4); ShardInterval *shardInterval = LoadShardInterval(shardId); Oid distributedTableId = shardInterval->relationId; List *shardPlacementList = NIL; ShardPlacement *sourcePlacement = NULL; ShardPlacement *targetPlacement = NULL; List *ddlCommandList = NIL; bool recreated = false; bool dataCopied = false; /* * By taking an exclusive lock on the shard, we both stop all modifications * (INSERT, UPDATE, or DELETE) and prevent concurrent repair operations from * being able to operate on this shard. */ LockShard(shardId, ExclusiveLock); shardPlacementList = LoadShardPlacementList(shardId); sourcePlacement = SearchShardPlacementInList(shardPlacementList, sourceNodeName, sourceNodePort); if (sourcePlacement->shardState != STATE_FINALIZED) { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("source placement must be in finalized state"))); } targetPlacement = SearchShardPlacementInList(shardPlacementList, targetNodeName, targetNodePort); if (targetPlacement->shardState != STATE_INACTIVE) { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("target placement must be in inactive state"))); } /* retrieve the DDL commands for the table and run them */ ddlCommandList = RecreateTableDDLCommandList(distributedTableId, shardId); recreated = ExecuteRemoteCommandList(targetPlacement->nodeName, targetPlacement->nodePort, ddlCommandList); if (!recreated) { ereport(ERROR, (errmsg("could not recreate shard table"), errhint("Consult recent messages in the server logs for " "details."))); } HOLD_INTERRUPTS(); dataCopied = CopyDataFromFinalizedPlacement(distributedTableId, shardId, sourcePlacement, targetPlacement); if (!dataCopied) { ereport(ERROR, (errmsg("could not copy shard data"), errhint("Consult recent messages in the server logs for " "details."))); } /* the placement is repaired, so return to finalized state */ DeleteShardPlacementRow(targetPlacement->id); InsertShardPlacementRow(targetPlacement->id, targetPlacement->shardId, STATE_FINALIZED, targetPlacement->nodeName, targetPlacement->nodePort); RESUME_INTERRUPTS(); PG_RETURN_VOID(); }
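/*
 * Editor's sketch (illustrative only): the tail of
 * master_copy_shard_placement() holds off interrupts so that the
 * delete-then-insert of the placement row cannot be torn apart by a
 * cancel/die interrupt, which would leave no placement row at all.
 * Reduced to its essentials, the bracket looks like this.
 */
static void
swap_placement_to_finalized_sketch(ShardPlacement *placement)
{
	HOLD_INTERRUPTS();
	DeleteShardPlacementRow(placement->id);
	InsertShardPlacementRow(placement->id, placement->shardId,
							STATE_FINALIZED,
							placement->nodeName, placement->nodePort);
	RESUME_INTERRUPTS();
}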
/** * This method is called after fork of the sweeper process. It sets up signal * handlers and does initialization that is required by a postgres backend. */ NON_EXEC_STATIC void BackoffSweeperMain(int argc, char *argv[]) { sigjmp_buf local_sigjmp_buf; IsUnderPostmaster = true; isSweeperProcess = true; /* Stay away from PMChildSlot */ MyPMChildSlot = -1; /* reset MyProcPid */ MyProcPid = getpid(); /* Lose the postmaster's on-exit routines */ on_exit_reset(); /* Identify myself via ps */ init_ps_display("sweeper process", "", "", ""); SetProcessingMode(InitProcessing); /* * Set up signal handlers. We operate on databases much like a regular * backend, so we use the same signal handling. See equivalent code in * tcop/postgres.c. */ pqsignal(SIGHUP, SIG_IGN); pqsignal(SIGINT, SIG_IGN); pqsignal(SIGALRM, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGUSR1, SIG_IGN); pqsignal(SIGTERM, die); pqsignal(SIGQUIT, quickdie); pqsignal(SIGUSR2, BackoffRequestShutdown); pqsignal(SIGFPE, FloatExceptionHandler); pqsignal(SIGCHLD, SIG_DFL); /* * Copied from bgwriter */ CurrentResourceOwner = ResourceOwnerCreate(NULL, "Sweeper process"); /* Early initialization */ BaseInit(); /* See InitPostgres()... */ InitProcess(); SetProcessingMode(NormalProcessing); /* * If an exception is encountered, processing resumes here. * * See notes in postgres.c about the design of this coding. */ if (sigsetjmp(local_sigjmp_buf, 1) != 0) { /* Prevents interrupts while cleaning up */ HOLD_INTERRUPTS(); /* Report the error to the server log */ EmitErrorReport(); /* * We can now go away. Note that because we'll call InitProcess, a * callback will be registered to do ProcKill, which will clean up * necessary state. */ proc_exit(0); } /* We can now handle ereport(ERROR) */ PG_exception_stack = &local_sigjmp_buf; PG_SETMASK(&UnBlockSig); MyBackendId = InvalidBackendId; /* main loop */ BackoffSweeperLoop(); /* One iteration done, go away */ proc_exit(0); }
/* * Main entry point for bgwriter process * * This is invoked from AuxiliaryProcessMain, which has already created the * basic execution environment, but not enabled signals yet. */ void BackgroundWriterMain(void) { sigjmp_buf local_sigjmp_buf; MemoryContext bgwriter_context; bool prev_hibernate; /* * Properly accept or ignore signals the postmaster might send us. * * bgwriter doesn't participate in ProcSignal signalling, but a SIGUSR1 * handler is still needed for latch wakeups. */ pqsignal(SIGHUP, BgSigHupHandler); /* set flag to read config file */ pqsignal(SIGINT, SIG_IGN); pqsignal(SIGTERM, ReqShutdownHandler); /* shutdown */ pqsignal(SIGQUIT, bg_quickdie); /* hard crash time */ pqsignal(SIGALRM, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGUSR1, bgwriter_sigusr1_handler); pqsignal(SIGUSR2, SIG_IGN); /* * Reset some signals that are accepted by postmaster but not here */ pqsignal(SIGCHLD, SIG_DFL); pqsignal(SIGTTIN, SIG_DFL); pqsignal(SIGTTOU, SIG_DFL); pqsignal(SIGCONT, SIG_DFL); pqsignal(SIGWINCH, SIG_DFL); /* We allow SIGQUIT (quickdie) at all times */ sigdelset(&BlockSig, SIGQUIT); /* * Create a resource owner to keep track of our resources (currently only * buffer pins). */ CurrentResourceOwner = ResourceOwnerCreate(NULL, "Background Writer"); /* * We just started, assume there has been either a shutdown or * end-of-recovery snapshot. */ last_snapshot_ts = GetCurrentTimestamp(); /* * Create a memory context that we will do all our work in. We do this so * that we can reset the context during error recovery and thereby avoid * possible memory leaks. Formerly this code just ran in * TopMemoryContext, but resetting that would be a really bad idea. */ bgwriter_context = AllocSetContextCreate(TopMemoryContext, "Background Writer", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); MemoryContextSwitchTo(bgwriter_context); /* * If an exception is encountered, processing resumes here. * * See notes in postgres.c about the design of this coding. */ if (sigsetjmp(local_sigjmp_buf, 1) != 0) { /* Since not using PG_TRY, must reset error stack by hand */ error_context_stack = NULL; /* Prevent interrupts while cleaning up */ HOLD_INTERRUPTS(); /* Report the error to the server log */ EmitErrorReport(); /* * These operations are really just a minimal subset of * AbortTransaction(). We don't have very many resources to worry * about in bgwriter, but we do have LWLocks, buffers, and temp files. */ LWLockReleaseAll(); AbortBufferIO(); UnlockBuffers(); /* buffer pins are released here: */ ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_BEFORE_LOCKS, false, true); /* we needn't bother with the other ResourceOwnerRelease phases */ AtEOXact_Buffers(false); AtEOXact_SMgr(); AtEOXact_Files(); AtEOXact_HashTables(false); /* * Now return to normal top-level context and clear ErrorContext for * next time. */ MemoryContextSwitchTo(bgwriter_context); FlushErrorState(); /* Flush any leaked data in the top-level context */ MemoryContextResetAndDeleteChildren(bgwriter_context); /* Now we can allow interrupts again */ RESUME_INTERRUPTS(); /* * Sleep at least 1 second after any error. A write error is likely * to be repeated, and we don't want to be filling the error logs as * fast as we can. */ pg_usleep(1000000L); /* * Close all open files after any error. This is helpful on Windows, * where holding deleted files open causes various strange errors. * It's not clear we need it elsewhere, but shouldn't hurt. 
*/ smgrcloseall(); /* Report wait end here, when there is no further possibility of wait */ pgstat_report_wait_end(); } /* We can now handle ereport(ERROR) */ PG_exception_stack = &local_sigjmp_buf; /* * Unblock signals (they were blocked when the postmaster forked us) */ PG_SETMASK(&UnBlockSig); /* * Reset hibernation state after any error. */ prev_hibernate = false; /* * Loop forever */ for (;;) { bool can_hibernate; int rc; /* Clear any already-pending wakeups */ ResetLatch(MyLatch); if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); } if (shutdown_requested) { /* * From here on, elog(ERROR) should end with exit(1), not send * control back to the sigsetjmp block above */ ExitOnAnyError = true; /* Normal exit from the bgwriter is here */ proc_exit(0); /* done */ } /* * Do one cycle of dirty-buffer writing. */ can_hibernate = BgBufferSync(); /* * Send off activity statistics to the stats collector */ pgstat_send_bgwriter(); if (FirstCallSinceLastCheckpoint()) { /* * After any checkpoint, close all smgr files. This is so we * won't hang onto smgr references to deleted files indefinitely. */ smgrcloseall(); } /* * Log a new xl_running_xacts every now and then so replication can * get into a consistent state faster (think of suboverflowed * snapshots) and clean up resources (locks, KnownXids*) more * frequently. The costs of this are relatively low, so doing it 4 * times (LOG_SNAPSHOT_INTERVAL_MS) a minute seems fine. * * We assume the interval for writing xl_running_xacts is * significantly bigger than BgWriterDelay, so we don't complicate the * overall timeout handling but just assume we're going to get called * often enough even if hibernation mode is active. It's not that * important that log_snap_interval_ms is met strictly. To make sure * we're not waking the disk up unnecessarily on an idle system we * check whether there has been any WAL inserted since the last time * we've logged a running xacts. * * We do this logging in the bgwriter as it's the only process that is * run regularly and returns to its mainloop all the time. E.g. the * checkpointer, when active, is barely ever in its mainloop and thus * makes it hard to log regularly. */ if (XLogStandbyInfoActive() && !RecoveryInProgress()) { TimestampTz timeout = 0; TimestampTz now = GetCurrentTimestamp(); timeout = TimestampTzPlusMilliseconds(last_snapshot_ts, LOG_SNAPSHOT_INTERVAL_MS); /* * only log if enough time has passed and some xlog record has * been inserted. */ if (now >= timeout && last_snapshot_lsn != GetXLogInsertRecPtr()) { last_snapshot_lsn = LogStandbySnapshot(); last_snapshot_ts = now; } } /* * Sleep until we are signaled or BgWriterDelay has elapsed. * * Note: the feedback control loop in BgBufferSync() expects that we * will call it every BgWriterDelay msec. While it's not critical for * correctness that that be exact, the feedback loop might misbehave * if we stray too far from that. Hence, avoid loading this process * down with latch events that are likely to happen frequently during * normal operation. */ rc = WaitLatch(MyLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, BgWriterDelay /* ms */ ); /* * If no latch event and BgBufferSync says nothing's happening, extend * the sleep in "hibernation" mode, where we sleep for much longer * than bgwriter_delay says. Fewer wakeups save electricity. When a * backend starts using buffers again, it will wake us up by setting * our latch.
Because the extra sleep will persist only as long as no * buffer allocations happen, this should not distort the behavior of * BgBufferSync's control loop too badly; essentially, it will think * that the system-wide idle interval didn't exist. * * There is a race condition here, in that a backend might allocate a * buffer between the time BgBufferSync saw the alloc count as zero * and the time we call StrategyNotifyBgWriter. While it's not * critical that we not hibernate anyway, we try to reduce the odds of * that by only hibernating when BgBufferSync says nothing's happening * for two consecutive cycles. Also, we mitigate any possible * consequences of a missed wakeup by not hibernating forever. */ if (rc == WL_TIMEOUT && can_hibernate && prev_hibernate) { /* Ask for notification at next buffer allocation */ StrategyNotifyBgWriter(MyProc->pgprocno); /* Sleep ... */ rc = WaitLatch(MyLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, BgWriterDelay * HIBERNATE_FACTOR); /* Reset the notification request in case we timed out */ StrategyNotifyBgWriter(-1); } /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. */ if (rc & WL_POSTMASTER_DEATH) exit(1); prev_hibernate = can_hibernate; } }
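/*
 * Editor's sketch (standalone; names illustrative): the hibernation decision
 * above reduces to "take the long sleep only when BgBufferSync reported no
 * activity for two consecutive cycles".
 */
#include <stdbool.h>

#define SLEEP_MS			200		/* stands in for BgWriterDelay */
#define HIBERNATE_FACTOR	50

static int
next_sleep_ms(bool can_hibernate, bool *prev_hibernate)
{
	int			sleep_ms = SLEEP_MS;

	if (can_hibernate && *prev_hibernate)
		sleep_ms = SLEEP_MS * HIBERNATE_FACTOR; /* hibernate */

	*prev_hibernate = can_hibernate;
	return sleep_ms;
}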
/* * LWLockAcquire - acquire a lightweight lock in the specified mode * * If the lock is not available, sleep until it is. * * Side effect: cancel/die interrupts are held off until lock release. */ void LWLockAcquire(LWLockId lockid, LWLockMode mode) { volatile LWLock *lock = &(LWLockArray[lockid].lock); PGPROC *proc = MyProc; bool retry = false; int extraWaits = 0; PRINT_LWDEBUG("LWLockAcquire", lockid, lock); #ifdef LWLOCK_STATS /* Set up local count state first time through in a given process */ if (counts_for_pid != MyProcPid) { int *LWLockCounter = (int *) ((char *) LWLockArray - 2 * sizeof(int)); int numLocks = LWLockCounter[1]; sh_acquire_counts = calloc(numLocks, sizeof(int)); ex_acquire_counts = calloc(numLocks, sizeof(int)); block_counts = calloc(numLocks, sizeof(int)); counts_for_pid = MyProcPid; on_shmem_exit(print_lwlock_stats, 0); } /* Count lock acquisition attempts */ if (mode == LW_EXCLUSIVE) ex_acquire_counts[lockid]++; else sh_acquire_counts[lockid]++; #endif /* LWLOCK_STATS */ /* * We can't wait if we haven't got a PGPROC. This should only occur * during bootstrap or shared memory initialization. Put an Assert here * to catch unsafe coding practices. */ Assert(!(proc == NULL && IsUnderPostmaster)); /* Ensure we will have room to remember the lock */ if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS) elog(ERROR, "too many LWLocks taken"); /* * Lock out cancel/die interrupts until we exit the code section protected * by the LWLock. This ensures that interrupts will not interfere with * manipulations of data structures in shared memory. */ HOLD_INTERRUPTS(); /* * Loop here to try to acquire lock after each time we are signaled by * LWLockRelease. * * NOTE: it might seem better to have LWLockRelease actually grant us the * lock, rather than retrying and possibly having to go back to sleep. But * in practice that is no good because it means a process swap for every * lock acquisition when two or more processes are contending for the same * lock. Since LWLocks are normally used to protect not-very-long * sections of computation, a process needs to be able to acquire and * release the same lock many times during a single CPU time slice, even * in the presence of contention. The efficiency of being able to do that * outweighs the inefficiency of sometimes wasting a process dispatch * cycle because the lock is not free when a released waiter finally gets * to run. See pgsql-hackers archives for 29-Dec-01. */ for (;;) { bool mustwait; /* Acquire mutex. Time spent holding mutex should be short! */ SpinLockAcquire(&lock->mutex); /* If retrying, allow LWLockRelease to release waiters again */ if (retry) lock->releaseOK = true; /* If I can get the lock, do so quickly. */ if (mode == LW_EXCLUSIVE) { if (lock->exclusive == 0 && lock->shared == 0) { lock->exclusive++; mustwait = false; } else mustwait = true; } else { if (lock->exclusive == 0) { lock->shared++; mustwait = false; } else mustwait = true; } if (!mustwait) break; /* got the lock */ /* * Add myself to wait queue. * * If we don't have a PGPROC structure, there's no way to wait. This * should never occur, since MyProc should only be null during shared * memory initialization. */ if (proc == NULL) elog(PANIC, "cannot wait without a PGPROC structure"); proc->lwWaiting = true; proc->lwExclusive = (mode == LW_EXCLUSIVE); proc->lwWaitLink = NULL; if (lock->head == NULL) lock->head = proc; else lock->tail->lwWaitLink = proc; lock->tail = proc; /* Can release the mutex now */ SpinLockRelease(&lock->mutex); /* * Wait until awakened. 
* * Since we share the process wait semaphore with the regular lock * manager and ProcWaitForSignal, and we may need to acquire an LWLock * while one of those is pending, it is possible that we get awakened * for a reason other than being signaled by LWLockRelease. If so, * loop back and wait again. Once we've gotten the LWLock, * re-increment the sema by the number of additional signals received, * so that the lock manager or signal manager will see the received * signal when it next waits. */ LOG_LWDEBUG("LWLockAcquire", lockid, "waiting"); #ifdef LWLOCK_STATS block_counts[lockid]++; #endif TRACE_POSTGRESQL_LWLOCK_WAIT_START(lockid, mode); for (;;) { /* "false" means cannot accept cancel/die interrupt here. */ PGSemaphoreLock(&proc->sem, false); if (!proc->lwWaiting) break; extraWaits++; } TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(lockid, mode); LOG_LWDEBUG("LWLockAcquire", lockid, "awakened"); /* Now loop back and try to acquire lock again. */ retry = true; } /* We are done updating shared state of the lock itself. */ SpinLockRelease(&lock->mutex); TRACE_POSTGRESQL_LWLOCK_ACQUIRE(lockid, mode); /* Add lock to list of locks held by this backend */ held_lwlocks[num_held_lwlocks++] = lockid; /* * Fix the process wait semaphore's count for any absorbed wakeups. */ while (extraWaits-- > 0) PGSemaphoreUnlock(&proc->sem); }
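/*
 * Editor's sketch (illustrative): the canonical call pattern for the
 * function above.  SharedCounter and update_shared_counter() are
 * hypothetical; ShmemIndexLock is one of the predefined LWLockIds.
 * Interrupts are held off by LWLockAcquire and released again by
 * LWLockRelease, so no explicit HOLD/RESUME pair is needed here.
 */
typedef struct SharedCounter
{
	uint64		value;
} SharedCounter;

static void
update_shared_counter(volatile SharedCounter *counter)
{
	LWLockAcquire(ShmemIndexLock, LW_EXCLUSIVE);
	counter->value++;			/* protected shared-memory update */
	LWLockRelease(ShmemIndexLock);
}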
void FileRepResetPeer_Main(void) { /* BASIC PROCESS SETUP */ FileRepReset_ConfigureSignals(); /* * If an exception is encountered, processing resumes here. * * See notes in postgres.c about the design of this coding and comments about how the error * handling works. */ sigjmp_buf local_sigjmp_buf; if (sigsetjmp(local_sigjmp_buf, 1) != 0) { HOLD_INTERRUPTS(); EmitErrorReport(); proc_exit(EXIT_CODE_SHOULD_ENTER_FAULT); } /* We can now handle ereport(ERROR) */ PG_exception_stack = &local_sigjmp_buf; PG_SETMASK(&UnBlockSig); /** NOW DO THE ACTUAL WORK */ char messageFromPeer[MESSAGE_FROM_PEER_BUF_SIZE]; char resetNumberFromPeer[MESSAGE_FROM_PEER_BUF_SIZE]; char resetNumberThatIndicatesResetComplete[MESSAGE_FROM_PEER_BUF_SIZE]; struct addrinfo *addrList = NULL; char portStr[100]; PrimaryMirrorModeTransitionArguments args = primaryMirrorGetArgumentsFromLocalMemory(); Assert(args.mode == PMModePrimarySegment || args.mode == PMModeMirrorSegment); snprintf(portStr, sizeof(portStr), "%d", args.peerPostmasterPort); if (! determineTargetHost(&addrList, args.peerAddress, portStr)) { elog(WARNING, "during reset, unable to look up address for peer host to coordinate reset; " "will transition to fault state."); proc_exit(EXIT_CODE_SHOULD_ENTER_FAULT); } sendMessageToPeerAndExitIfProblem(addrList, "beginPostmasterReset", messageFromPeer, resetNumberThatIndicatesResetComplete); for ( ;; ) { pg_usleep(10 * 1000L); /* 10 ms */ sendMessageToPeerAndExitIfProblem(addrList, "getPostmasterResetStatus", messageFromPeer, resetNumberFromPeer ); if (strequals(messageFromPeer, RESET_STATUS_IS_IN_RESET_PIVOT_POINT)) { if (args.mode == PMModeMirrorSegment) { /** * peer is in the reset pivot point, we can break out of our checking loop and * thus exit with a code telling the postmaster to begin the startup sequence again * * this is only done on the mirror as currently the mirror must execute the startup sequence * before the primary */ elog(DEBUG1, "peer reset: primary peer has reached reset point"); break; } } else if (strequals(messageFromPeer, RESET_STATUS_IS_RUNNING)) { /** it's running -- is it >= the reset number that indicates the reset is complete? */ if (strcmp( resetNumberFromPeer, resetNumberThatIndicatesResetComplete) >= 0) { /** yes, the reset is complete and so we can quit and do a restart */ elog(DEBUG1, "peer reset: mirror peer reset is complete"); break; } } } proc_exit(EXIT_CODE_SHOULD_RESTART_SHMEM_CLEANLY); }
/* * LWLockConditionalAcquire - acquire a lightweight lock in the specified mode * * If the lock is not available, return FALSE with no side-effects. * * If successful, cancel/die interrupts are held off until lock release. */ bool LWLockConditionalAcquire(LWLockId lockid, LWLockMode mode) { volatile LWLock *lock = &(LWLockArray[lockid].lock); bool mustwait; PRINT_LWDEBUG("LWLockConditionalAcquire", lockid, lock); /* Ensure we will have room to remember the lock */ if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS) elog(ERROR, "too many LWLocks taken"); /* * Lock out cancel/die interrupts until we exit the code section protected * by the LWLock. This ensures that interrupts will not interfere with * manipulations of data structures in shared memory. */ HOLD_INTERRUPTS(); /* Acquire mutex. Time spent holding mutex should be short! */ SpinLockAcquire(&lock->mutex); /* If I can get the lock, do so quickly. */ if (mode == LW_EXCLUSIVE) { if (lock->exclusive == 0 && lock->shared == 0) { lock->exclusive++; mustwait = false; } else mustwait = true; } else { if (lock->exclusive == 0) { lock->shared++; mustwait = false; } else mustwait = true; } /* We are done updating shared state of the lock itself. */ SpinLockRelease(&lock->mutex); if (mustwait) { /* Failed to get lock, so release interrupt holdoff */ RESUME_INTERRUPTS(); LOG_LWDEBUG("LWLockConditionalAcquire", lockid, "failed"); TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL(lockid, mode); } else { /* Add lock to list of locks held by this backend */ held_lwlocks[num_held_lwlocks++] = lockid; TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE(lockid, mode); } return !mustwait; }
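/*
 * Editor's sketch (illustrative): a typical opportunistic caller of
 * LWLockConditionalAcquire().  try_update_stats() and the lock id are
 * hypothetical; the point is that the failure path needs no cleanup, since
 * the function already released the interrupt holdoff itself.
 */
static bool
try_update_stats(LWLockId stats_lock)
{
	if (!LWLockConditionalAcquire(stats_lock, LW_EXCLUSIVE))
		return false;			/* busy: skip this cycle, retry later */

	/* ... update the shared statistics area here ... */

	LWLockRelease(stats_lock);
	return true;
}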
/* * LWLockAcquireOrWait - Acquire lock, or wait until it's free * * The semantics of this function are a bit funky. If the lock is currently * free, it is acquired in the given mode, and the function returns true. If * the lock isn't immediately free, the function waits until it is released * and returns false, but does not acquire the lock. * * This is currently used for WALWriteLock: when a backend flushes the WAL, * holding WALWriteLock, it can flush the commit records of many other * backends as a side-effect. Those other backends need to wait until the * flush finishes, but don't need to acquire the lock anymore. They can just * wake up, observe that their records have already been flushed, and return. */ bool LWLockAcquireOrWait(LWLock *lock, LWLockMode mode) { PGPROC *proc = MyProc; bool mustwait; int extraWaits = 0; #ifdef LWLOCK_STATS lwlock_stats *lwstats; #endif PRINT_LWDEBUG("LWLockAcquireOrWait", lock); #ifdef LWLOCK_STATS lwstats = get_lwlock_stats_entry(lock); #endif /* Ensure we will have room to remember the lock */ if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS) elog(ERROR, "too many LWLocks taken"); /* * Lock out cancel/die interrupts until we exit the code section protected * by the LWLock. This ensures that interrupts will not interfere with * manipulations of data structures in shared memory. */ HOLD_INTERRUPTS(); /* Acquire mutex. Time spent holding mutex should be short! */ SpinLockAcquire(&lock->mutex); /* If I can get the lock, do so quickly. */ if (mode == LW_EXCLUSIVE) { if (lock->exclusive == 0 && lock->shared == 0) { lock->exclusive++; mustwait = false; } else mustwait = true; } else { if (lock->exclusive == 0) { lock->shared++; mustwait = false; } else mustwait = true; } if (mustwait) { /* * Add myself to wait queue. * * If we don't have a PGPROC structure, there's no way to wait. This * should never occur, since MyProc should only be null during shared * memory initialization. */ if (proc == NULL) elog(PANIC, "cannot wait without a PGPROC structure"); proc->lwWaiting = true; proc->lwWaitMode = LW_WAIT_UNTIL_FREE; proc->lwWaitLink = NULL; if (lock->head == NULL) lock->head = proc; else lock->tail->lwWaitLink = proc; lock->tail = proc; /* Can release the mutex now */ SpinLockRelease(&lock->mutex); /* * Wait until awakened. Like in LWLockAcquire, be prepared for bogus * wakeups, because we share the semaphore with ProcWaitForSignal. */ LOG_LWDEBUG("LWLockAcquireOrWait", T_NAME(lock), T_ID(lock), "waiting"); #ifdef LWLOCK_STATS lwstats->block_count++; #endif TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), T_ID(lock), mode); for (;;) { /* "false" means cannot accept cancel/die interrupt here. */ PGSemaphoreLock(&proc->sem, false); if (!proc->lwWaiting) break; extraWaits++; } TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), T_ID(lock), mode); LOG_LWDEBUG("LWLockAcquireOrWait", T_NAME(lock), T_ID(lock), "awakened"); } else { /* We are done updating shared state of the lock itself. */ SpinLockRelease(&lock->mutex); } /* * Fix the process wait semaphore's count for any absorbed wakeups.
*/ while (extraWaits-- > 0) PGSemaphoreUnlock(&proc->sem); if (mustwait) { /* Failed to get lock, so release interrupt holdoff */ RESUME_INTERRUPTS(); LOG_LWDEBUG("LWLockAcquireOrWait", T_NAME(lock), T_ID(lock), "failed"); TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_FAIL(T_NAME(lock), T_ID(lock), mode); } else { /* Add lock to list of locks held by this backend */ held_lwlocks[num_held_lwlocks++] = lock; TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT(T_NAME(lock), T_ID(lock), mode); } return !mustwait; }
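/*
 * Editor's sketch (modelled on the WALWriteLock usage the header comment
 * describes; the helper names are hypothetical): a backend that only needs
 * its own records flushed can stop as soon as somebody else's flush covers
 * them.
 */
static void
flush_up_to_sketch(XLogRecPtr record)
{
	for (;;)
	{
		/* done if somebody already flushed past our record */
		if (wal_flushed_past(record))	/* hypothetical */
			return;

		if (LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
		{
			/* got the lock: we must do the flush ourselves */
			perform_wal_flush(record);	/* hypothetical */
			LWLockRelease(WALWriteLock);
			return;
		}
		/* else: another flusher just finished; loop and re-check */
	}
}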
/* * LWLockAcquire - acquire a lightweight lock in the specified mode * * If the lock is not available, sleep until it is. * * Side effect: cancel/die interrupts are held off until lock release. */ void LWLockAcquire(LWLockId lockid, LWLockMode mode) { volatile LWLock *lock = &(LWLockArray[lockid].lock); #if LWLOCK_LOCK_PARTS > 1 volatile LWLockPart *part = LWLOCK_PART(lock, lockid, MyBackendId); #endif PGPROC *proc = MyProc; bool retry = false; int extraWaits = 0; PRINT_LWDEBUG("LWLockAcquire", lockid, lock); #ifdef LWLOCK_STATS /* Set up local count state first time through in a given process */ if (counts_for_pid != MyProcPid) { int *LWLockCounter = (int *) ((char *) LWLockArray - 2 * sizeof(int)); int numLocks = LWLockCounter[1]; sh_acquire_counts = calloc(numLocks, sizeof(int)); ex_acquire_counts = calloc(numLocks, sizeof(int)); block_counts = calloc(numLocks, sizeof(int)); counts_for_pid = MyProcPid; on_shmem_exit(print_lwlock_stats, 0); } /* Count lock acquisition attempts */ if (mode == LW_EXCLUSIVE) ex_acquire_counts[lockid]++; else sh_acquire_counts[lockid]++; #endif /* LWLOCK_STATS */ /* * We can't wait if we haven't got a PGPROC. This should only occur * during bootstrap or shared memory initialization. Put an Assert here * to catch unsafe coding practices. */ Assert(!(proc == NULL && IsUnderPostmaster)); /* Ensure we will have room to remember the lock */ if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS) elog(ERROR, "too many LWLocks taken"); /* * Lock out cancel/die interrupts until we exit the code section protected * by the LWLock. This ensures that interrupts will not interfere with * manipulations of data structures in shared memory. */ HOLD_INTERRUPTS(); /* * Loop here to try to acquire lock after each time we are signaled by * LWLockRelease. * * NOTE: it might seem better to have LWLockRelease actually grant us the * lock, rather than retrying and possibly having to go back to sleep. But * in practice that is no good because it means a process swap for every * lock acquisition when two or more processes are contending for the same * lock. Since LWLocks are normally used to protect not-very-long * sections of computation, a process needs to be able to acquire and * release the same lock many times during a single CPU time slice, even * in the presence of contention. The efficiency of being able to do that * outweighs the inefficiency of sometimes wasting a process dispatch * cycle because the lock is not free when a released waiter finally gets * to run. See pgsql-hackers archives for 29-Dec-01. */ for (;;) { bool mustwait; if (mode == LW_SHARED) { #ifdef LWLOCK_PART_SHARED_OPS_ATOMIC /* Increment shared counter partition. If there's no contention, * this is sufficient to take the lock */ LWLOCK_PART_SHARED_POSTINC_ATOMIC(lock, lockid, part, MyBackendId); LWLOCK_PART_SHARED_FENCE(); /* A concurrent exclusive locking attempt does the following * three steps * 1) Acquire mutex * 2) Check shared counter partitions for readers. * 3a) If found add proc to wait queue, block, restart at (1) * 3b) If not found, set exclusive flag, continue with (4) * 4) Enter protected section * The fence after the atomic add above ensures that no further * such attempt can proceed to (3b) or beyond. There may be * pre-existing exclusive locking attempts at step (3b) or beyond, * but we can recognize those by either the mutex being taken, or * the exclusive flag being set. Conversely, if we see neither, we * may proceed and enter the protected section. 
* * FIXME: This doesn't work if slock_t is a struct or doesn't * use 0 for state "unlocked". */ if ((lock->mutex == 0) && (lock->exclusive == 0)) { /* If retrying, allow LWLockRelease to release waiters again. * Usually this happens after we acquired the mutex, but if * we skip that, we still need to set releaseOK. * * Acquiring the mutex here is not really an option - if many * readers are awoken simultaneously by an exclusive unlock, * that would be a source of considerable contention. * * Fortunately, this is safe even without the mutex. First, * there actually cannot be any non-fast path unlocking * attempt in progress, because we'd then either still see * the exclusive flag set or the mutex being taken. And * even if there was, and such an attempt cleared the flag * immediately after we set it, it'd also wake up some waiter * who'd then re-set the flag. * * The only reason to do this here, and not directly * after returning from PGSemaphoreLock(), is that it seems * beneficial to make SpinLockAcquire() the first thing to * touch the lock if possible, in case we acquire the spin * lock at all. That way, the cache line doesn't go through * a possible shared state, but instead directly to exclusive. * On Opterons at least, there seems to be a difference, cf. * the comment above tas() for x86_64 in s_lock.h */ if (retry && !lock->releaseOK) lock->releaseOK = true; goto lock_acquired; } /* At this point, we don't know if the concurrent exclusive locker * has proceeded to (3b) or blocked. We must take the mutex and * re-check */ #endif /* LWLOCK_PART_SHARED_OPS_ATOMIC */ /* Acquire mutex. Time spent holding mutex should be short! */ SpinLockAcquire(&lock->mutex); if (lock->exclusive == 0) { #ifdef LWLOCK_PART_SHARED_OPS_ATOMIC /* Already incremented the shared counter partition above */ #else lock->shared++; #endif mustwait = false; } else { #ifdef LWLOCK_PART_SHARED_OPS_ATOMIC /* Must undo shared counter partition increment. Note that * we *need* to do that while holding the mutex. Otherwise, * the exclusive lock could be released and attempted to be * re-acquired before we undo the increment. That attempt * would then block, even though there'd be no lock holder * left */ LWLOCK_PART_SHARED_POSTDEC_ATOMIC(lock, lockid, part, MyBackendId); #endif mustwait = true; } } else { /* Step (1). Acquire mutex. Time spent holding mutex should be * short! */ SpinLockAcquire(&lock->mutex); if (lock->exclusive == 0) { /* Step (2). Check for shared lockers. This surely happens * after (1), otherwise SpinLockAcquire() is broken. Lock * acquire semantics demand that no load must be re-ordered * from after a lock acquisition to before, for obvious * reasons. */ LWLOCK_IS_SHARED(mustwait, lock, lockid); if (!mustwait) { /* Step (3a). Set exclusive flag. This surely happens * after (2) because it depends on the result of (2), * no matter how much reordering is going on here. */ lock->exclusive++; } } else mustwait = true; } /* If retrying, allow LWLockRelease to release waiters again. * This is also done separately in the LW_SHARED early-exit case * above, where, in contrast to here, the mutex is not held. * See the comment there for why this is safe */ if (retry) lock->releaseOK = true; if (!mustwait) break; /* got the lock */ /* * Step (3b). Add myself to wait queue. * * If we don't have a PGPROC structure, there's no way to wait. This * should never occur, since MyProc should only be null during shared * memory initialization.
*/ if (proc == NULL) elog(PANIC, "cannot wait without a PGPROC structure"); proc->lwWaiting = true; proc->lwExclusive = (mode == LW_EXCLUSIVE); proc->lwWaitLink = NULL; if (lock->head == NULL) lock->head = proc; else lock->tail->lwWaitLink = proc; lock->tail = proc; /* Can release the mutex now */ SpinLockRelease(&lock->mutex); /* * Wait until awakened. * * Since we share the process wait semaphore with the regular lock * manager and ProcWaitForSignal, and we may need to acquire an LWLock * while one of those is pending, it is possible that we get awakened * for a reason other than being signaled by LWLockRelease. If so, * loop back and wait again. Once we've gotten the LWLock, * re-increment the sema by the number of additional signals received, * so that the lock manager or signal manager will see the received * signal when it next waits. */ LOG_LWDEBUG("LWLockAcquire", lockid, "waiting"); #ifdef LWLOCK_STATS block_counts[lockid]++; #endif TRACE_POSTGRESQL_LWLOCK_WAIT_START(lockid, mode); for (;;) { /* "false" means cannot accept cancel/die interrupt here. */ PGSemaphoreLock(&proc->sem, false); if (!proc->lwWaiting) break; extraWaits++; } TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(lockid, mode); LOG_LWDEBUG("LWLockAcquire", lockid, "awakened"); /* Now loop back and try to acquire lock again. */ retry = true; } /* We are done updating shared state of the lock itself. */ SpinLockRelease(&lock->mutex); /* Step 4. Enter protected section. This surely happens after (3), * this time because lock release semantics demand that no store * must be moved from before a lock release to after the release, * again for obvious reasons */ #ifdef LWLOCK_PART_SHARED_OPS_ATOMIC lock_acquired: #endif TRACE_POSTGRESQL_LWLOCK_ACQUIRE(lockid, mode); /* Add lock to list of locks held by this backend */ held_lwlocks[num_held_lwlocks] = lockid; held_lwlocks_mode[num_held_lwlocks] = mode; ++num_held_lwlocks; /* * Fix the process wait semaphore's count for any absorbed wakeups. */ while (extraWaits-- > 0) PGSemaphoreUnlock(&proc->sem); }
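/*
 * Editor's sketch (standalone C11 rendering of the shared-counter fast path
 * described in steps (1)-(4) above; all names are illustrative and this
 * simplifies the real protocol).  A reader first announces itself with an
 * atomic increment, then checks for an exclusive holder; the ordering of
 * those two steps is what lets an exclusive locker that scans the partitions
 * afterwards reliably see the reader.  In the real code the failure-path
 * decrement happens while holding the spinlock.
 */
#include <stdatomic.h>
#include <stdbool.h>

#define NUM_PARTS 16

typedef struct PartitionedLock
{
	atomic_int	shared[NUM_PARTS];	/* per-partition reader counts */
	atomic_int	exclusive;			/* 0 or 1 */
} PartitionedLock;

static bool
shared_fastpath_try(PartitionedLock *lock, int my_part)
{
	/* step 1: announce ourselves (the seq_cst RMW doubles as the fence) */
	atomic_fetch_add(&lock->shared[my_part], 1);

	/* step 2: no exclusive holder visible -> fast path succeeded */
	if (atomic_load(&lock->exclusive) == 0)
		return true;

	/* undo and fall back to the mutex-protected slow path */
	atomic_fetch_sub(&lock->shared[my_part], 1);
	return false;
}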
/* * LWLockWaitForVar - Wait until lock is free, or a variable is updated. * * If the lock is held and *valptr equals oldval, waits until the lock is * either freed, or the lock holder updates *valptr by calling * LWLockUpdateVar. If the lock is free on exit (immediately or after * waiting), returns true. If the lock is still held, but *valptr no longer * matches oldval, returns false and sets *newval to the current value in * *valptr. * * It's possible that the lock holder releases the lock, but another backend * acquires it again before we get a chance to observe that the lock was * momentarily released. We wouldn't need to wait for the new lock holder, * but we cannot distinguish that case, so we will have to wait. * * Note: this function ignores shared lock holders; if the lock is held * in shared mode, returns 'true'. */ bool LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval) { PGPROC *proc = MyProc; int extraWaits = 0; bool result = false; #ifdef LWLOCK_STATS lwlock_stats *lwstats; #endif PRINT_LWDEBUG("LWLockWaitForVar", lock); #ifdef LWLOCK_STATS lwstats = get_lwlock_stats_entry(lock); #endif /* LWLOCK_STATS */ /* * Quick test first to see if the slot is free right now. * * XXX: the caller uses a spinlock before this, so we don't need a memory * barrier here as far as the current usage is concerned. But that might * not be safe in general. */ if (lock->exclusive == 0) return true; /* * Lock out cancel/die interrupts while we sleep on the lock. There is no * cleanup mechanism to remove us from the wait queue if we got * interrupted. */ HOLD_INTERRUPTS(); /* * Loop here to check the lock's status after each time we are signaled. */ for (;;) { bool mustwait; uint64 value; /* Acquire mutex. Time spent holding mutex should be short! */ #ifdef LWLOCK_STATS lwstats->spin_delay_count += SpinLockAcquire(&lock->mutex); #else SpinLockAcquire(&lock->mutex); #endif /* Is the lock now free, and if not, does the value match? */ if (lock->exclusive == 0) { result = true; mustwait = false; } else { value = *valptr; if (value != oldval) { result = false; mustwait = false; *newval = value; } else mustwait = true; } if (!mustwait) break; /* the lock was free or value didn't match */ /* * Add myself to wait queue. */ proc->lwWaiting = true; proc->lwWaitMode = LW_WAIT_UNTIL_FREE; /* waiters are added to the front of the queue */ proc->lwWaitLink = lock->head; if (lock->head == NULL) lock->tail = proc; lock->head = proc; /* * Set releaseOK, to make sure we get woken up as soon as the lock is * released. */ lock->releaseOK = true; /* Can release the mutex now */ SpinLockRelease(&lock->mutex); /* * Wait until awakened. * * Since we share the process wait semaphore with the regular lock * manager and ProcWaitForSignal, and we may need to acquire an LWLock * while one of those is pending, it is possible that we get awakened * for a reason other than being signaled by LWLockRelease. If so, * loop back and wait again. Once we've gotten the LWLock, * re-increment the sema by the number of additional signals received, * so that the lock manager or signal manager will see the received * signal when it next waits. */ LOG_LWDEBUG("LWLockWaitForVar", T_NAME(lock), T_ID(lock), "waiting"); #ifdef LWLOCK_STATS lwstats->block_count++; #endif TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), T_ID(lock), LW_EXCLUSIVE); for (;;) { /* "false" means cannot accept cancel/die interrupt here.
*/ PGSemaphoreLock(&proc->sem, false); if (!proc->lwWaiting) break; extraWaits++; } TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), T_ID(lock), LW_EXCLUSIVE); LOG_LWDEBUG("LWLockWaitForVar", T_NAME(lock), T_ID(lock), "awakened"); /* Now loop back and check the status of the lock again. */ } /* We are done updating shared state of the lock itself. */ SpinLockRelease(&lock->mutex); TRACE_POSTGRESQL_LWLOCK_ACQUIRE(T_NAME(lock), T_ID(lock), LW_EXCLUSIVE); /* * Fix the process wait semaphore's count for any absorbed wakeups. */ while (extraWaits-- > 0) PGSemaphoreUnlock(&proc->sem); /* * Now okay to allow cancel/die interrupts. */ RESUME_INTERRUPTS(); return result; }
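/*
 * Editor's sketch (illustrative; modelled on how a caller waits for a
 * WAL-insertion position to advance, with hypothetical names): each false
 * return means the holder moved the variable, so the caller adopts the new
 * value and waits again until the lock is actually free.
 */
static void
wait_for_insertion_to_finish_sketch(LWLock *lock, uint64 *insertingAt)
{
	uint64		oldval = 0;
	uint64		newval;

	while (!LWLockWaitForVar(lock, insertingAt, oldval, &newval))
	{
		/* still held, but progress was made; wait for the next step */
		oldval = newval;
	}
	/* lock is free: the insertion we were waiting for has finished */
}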
/* * CreateShardsWithRoundRobinPolicy creates empty shards for the given table * based on the specified number of initial shards. The function first updates * metadata on the coordinator node to make this shard (and its placements) * visible. Note that the function assumes the table is hash partitioned and * calculates the min/max hash token ranges for each shard, giving them an equal * split of the hash space. Finally, the function creates empty shard placements * on worker nodes. */ void CreateShardsWithRoundRobinPolicy(Oid distributedTableId, int32 shardCount, int32 replicationFactor, bool useExclusiveConnections) { char shardStorageType = 0; List *workerNodeList = NIL; int32 workerNodeCount = 0; uint32 placementAttemptCount = 0; uint64 hashTokenIncrement = 0; List *existingShardList = NIL; int64 shardIndex = 0; DistTableCacheEntry *cacheEntry = DistributedTableCacheEntry(distributedTableId); bool colocatedShard = false; List *insertedShardPlacements = NIL; /* make sure table is hash partitioned */ CheckHashPartitionedTable(distributedTableId); /* * In contrast to append/range partitioned tables it makes more sense to * require ownership privileges - shards for hash-partitioned tables are * only created once, not continually during ingest as for the other * partitioning types. */ EnsureTableOwner(distributedTableId); /* we plan to add shards: get an exclusive lock on relation oid */ LockRelationOid(distributedTableId, ExclusiveLock); /* validate that shards haven't already been created for this table */ existingShardList = LoadShardList(distributedTableId); if (existingShardList != NIL) { char *tableName = get_rel_name(distributedTableId); ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("table \"%s\" has already had shards created for it", tableName))); } /* make sure that at least one shard is specified */ if (shardCount <= 0) { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("shard_count must be positive"))); } /* make sure that at least one replica is specified */ if (replicationFactor <= 0) { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("replication_factor must be positive"))); } /* make sure that RF=1 if the table is streaming replicated */ if (cacheEntry->replicationModel == REPLICATION_MODEL_STREAMING && replicationFactor > 1) { char *relationName = get_rel_name(cacheEntry->relationId); ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("using replication factor %d with the streaming " "replication model is not supported", replicationFactor), errdetail("The table %s is marked as streaming replicated and " "the shard replication factor of streaming replicated " "tables must be 1.", relationName), errhint("Use replication factor 1."))); } /* calculate the split of the hash space */ hashTokenIncrement = HASH_TOKEN_COUNT / shardCount; /* don't allow concurrent node list changes that require an exclusive lock */ LockRelationOid(DistNodeRelationId(), RowShareLock); /* load and sort the worker node list for deterministic placement */ workerNodeList = ActivePrimaryNodeList(); workerNodeList = SortList(workerNodeList, CompareWorkerNodes); /* make sure we don't process cancel signals until all shards are created */ HOLD_INTERRUPTS(); workerNodeCount = list_length(workerNodeList); if (replicationFactor > workerNodeCount) { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("replication_factor (%d) exceeds number of worker nodes " "(%d)", replicationFactor, workerNodeCount), errhint("Add more worker nodes 
or try again with a lower " "replication factor."))); } /* if we have enough nodes, add an extra placement attempt for backup */ placementAttemptCount = (uint32) replicationFactor; if (workerNodeCount > replicationFactor) { placementAttemptCount++; } /* set shard storage type according to relation type */ shardStorageType = ShardStorageType(distributedTableId); for (shardIndex = 0; shardIndex < shardCount; shardIndex++) { uint32 roundRobinNodeIndex = shardIndex % workerNodeCount; /* initialize the hash token space for this shard */ text *minHashTokenText = NULL; text *maxHashTokenText = NULL; int32 shardMinHashToken = INT32_MIN + (shardIndex * hashTokenIncrement); int32 shardMaxHashToken = shardMinHashToken + (hashTokenIncrement - 1); uint64 shardId = GetNextShardId(); List *currentInsertedShardPlacements = NIL; /* if we are at the last shard, make sure the max token value is INT_MAX */ if (shardIndex == (shardCount - 1)) { shardMaxHashToken = INT32_MAX; } /* insert the shard metadata row along with its min/max values */ minHashTokenText = IntegerToText(shardMinHashToken); maxHashTokenText = IntegerToText(shardMaxHashToken); /* * Grabbing the shard metadata lock isn't technically necessary since * we already hold an exclusive lock on the partition table, but we'll * acquire it for the sake of completeness. As we're adding new active * placements, the mode must be exclusive. */ LockShardDistributionMetadata(shardId, ExclusiveLock); InsertShardRow(distributedTableId, shardId, shardStorageType, minHashTokenText, maxHashTokenText); currentInsertedShardPlacements = InsertShardPlacementRows(distributedTableId, shardId, workerNodeList, roundRobinNodeIndex, replicationFactor); insertedShardPlacements = list_concat(insertedShardPlacements, currentInsertedShardPlacements); } CreateShardsOnWorkers(distributedTableId, insertedShardPlacements, useExclusiveConnections, colocatedShard); if (QueryCancelPending) { ereport(WARNING, (errmsg("cancel requests are ignored during shard creation"))); QueryCancelPending = false; } RESUME_INTERRUPTS(); }
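/*
 * Editor's sketch (standalone; assumes HASH_TOKEN_COUNT is 2^32, i.e. the
 * full int32 hash space): the min/max token arithmetic used above, including
 * the INT32_MAX clamp on the last shard so that rounding in the integer
 * division never leaves part of the space uncovered.
 */
#include <stdint.h>
#include <stdio.h>

#define HASH_TOKEN_COUNT (INT64_C(1) << 32)	/* assumption: 2^32 tokens */

static void
print_shard_token_ranges(int shardCount)
{
	int64_t		increment = HASH_TOKEN_COUNT / shardCount;

	for (int i = 0; i < shardCount; i++)
	{
		int64_t		min = (int64_t) INT32_MIN + i * increment;
		int64_t		max = (i == shardCount - 1) ?
			INT32_MAX : min + increment - 1;

		printf("shard %d: [%lld, %lld]\n",
			   i, (long long) min, (long long) max);
	}
}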
/* -------------------------------- * internal_flush - flush pending output * * Returns 0 if OK (meaning everything was sent, or operation would block * and the socket is in non-blocking mode), or EOF if trouble. * -------------------------------- */ static int internal_flush(void) { static int last_reported_send_errno = 0; char *bufptr = PqSendBuffer + PqSendStart; char *bufend = PqSendBuffer + PqSendPointer; while (bufptr < bufend) { int r; r = secure_write(MyProcPort, bufptr, bufend - bufptr); if (r <= 0) { if (errno == EINTR) continue; /* Ok if we were interrupted */ /* * Ok if no data writable without blocking, and the socket is in * non-blocking mode. */ if (errno == EAGAIN || errno == EWOULDBLOCK) { return 0; } /* * Careful: an ereport() that tries to write to the client would * cause recursion to here, leading to stack overflow and core * dump! This message must go *only* to the postmaster log. * * If a client disconnects while we're in the midst of output, we * might write quite a bit of data before we get to a safe query * abort point. So, suppress duplicate log messages. */ if (errno != last_reported_send_errno) { last_reported_send_errno = errno; HOLD_INTERRUPTS(); /* hold off cancel/die so the ereport below can safely use the send mutex */ ereport(COMMERROR, (errcode_for_socket_access(), errmsg("could not send data to client: %m"))); RESUME_INTERRUPTS(); } /* * We drop the buffered data anyway so that processing can * continue, even though we'll probably quit soon. We also set a * flag that'll cause the next CHECK_FOR_INTERRUPTS to terminate * the connection. */ PqSendStart = PqSendPointer = 0; ClientConnectionLost = 1; InterruptPending = 1; return EOF; } last_reported_send_errno = 0; /* reset after any successful send */ bufptr += r; PqSendStart += r; } PqSendStart = PqSendPointer = 0; return 0; }
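/*
 * Editor's sketch (standalone POSIX rendering of the retry skeleton in
 * internal_flush(); names illustrative): retry on EINTR, treat
 * EAGAIN/EWOULDBLOCK on a non-blocking descriptor as "done for now", and
 * report anything else as real trouble.
 */
#include <errno.h>
#include <unistd.h>

static ssize_t
write_nonblocking_all(int fd, const char *buf, size_t len)
{
	size_t		sent = 0;

	while (sent < len)
	{
		ssize_t		r = write(fd, buf + sent, len - sent);

		if (r < 0)
		{
			if (errno == EINTR)
				continue;		/* interrupted by a signal: just retry */
			if (errno == EAGAIN || errno == EWOULDBLOCK)
				break;			/* would block: stop without error */
			return -1;			/* genuine I/O failure */
		}
		sent += (size_t) r;
	}
	return (ssize_t) sent;
}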
/* * Signal handler for SIGALRM * * Process any active timeout reasons and then reschedule the interrupt * as needed. */ static void handle_sig_alarm(SIGNAL_ARGS) { int save_errno = errno; bool save_ImmediateInterruptOK = ImmediateInterruptOK; /* * We may be executing while ImmediateInterruptOK is true (e.g., when * mainline is waiting for a lock). If SIGINT or similar arrives while * this code is running, we'd lose control and perhaps leave our data * structures in an inconsistent state. Disable immediate interrupts, and * just to be real sure, bump the holdoff counter as well. (The reason * for this belt-and-suspenders-too approach is to make sure that nothing * bad happens if a timeout handler calls code that manipulates * ImmediateInterruptOK.) * * Note: it's possible for a SIGINT to interrupt handle_sig_alarm before * we manage to do this; the net effect would be as if the SIGALRM event * had been silently lost. Therefore error recovery must include some * action that will allow any lost interrupt to be rescheduled. Disabling * some or all timeouts is sufficient, or if that's not appropriate, * reschedule_timeouts() can be called. Also, the signal blocking hazard * described below applies here too. */ ImmediateInterruptOK = false; HOLD_INTERRUPTS(); /* * SIGALRM is always cause for waking anything waiting on the process * latch. Cope with MyProc not being there, as the startup process also * uses this signal handler. */ if (MyProc) SetLatch(&MyProc->procLatch); /* * Fire any pending timeouts, but only if we're enabled to do so. */ if (alarm_enabled) { /* * Disable alarms, just in case this platform allows signal handlers * to interrupt themselves. schedule_alarm() will re-enable if * appropriate. */ disable_alarm(); if (num_active_timeouts > 0) { TimestampTz now = GetCurrentTimestamp(); /* While the first pending timeout has been reached ... */ while (num_active_timeouts > 0 && now >= active_timeouts[0]->fin_time) { timeout_params *this_timeout = active_timeouts[0]; /* Remove it from the active list */ remove_timeout_index(0); /* Mark it as fired */ this_timeout->indicator = true; /* And call its handler function */ (*this_timeout->timeout_handler) (); /* * The handler might not take negligible time (CheckDeadLock * for instance isn't too cheap), so let's update our idea of * "now" after each one. */ now = GetCurrentTimestamp(); } /* Done firing timeouts, so reschedule next interrupt if any */ schedule_alarm(now); } } /* * Re-allow query cancel, and then try to service any cancel request that * arrived meanwhile (this might in particular include a cancel request * fired by one of the timeout handlers). Since we are in a signal * handler, we mustn't call ProcessInterrupts unless ImmediateInterruptOK * is set; if it isn't, the cancel will happen at the next mainline * CHECK_FOR_INTERRUPTS. * * Note: a longjmp from here is safe so far as our own data structures are * concerned; but on platforms that block a signal before calling the * handler and then un-block it on return, longjmping out of the signal * handler leaves SIGALRM still blocked. Error cleanup is responsible for * unblocking any blocked signals. */ RESUME_INTERRUPTS(); ImmediateInterruptOK = save_ImmediateInterruptOK; if (save_ImmediateInterruptOK) CHECK_FOR_INTERRUPTS(); errno = save_errno; }
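/*
 * Editor's sketch (illustrative use of the timeout facility whose SIGALRM
 * handler is shown above; the handler body and the 5-second delay are
 * hypothetical).  RegisterTimeout() associates a handler with a timeout id,
 * and enable_timeout_after() arms it; the handler then runs inside
 * handle_sig_alarm() with interrupts held off.
 */
static TimeoutId my_timeout_id;

static void
my_timeout_handler(void)
{
	/* invoked from handle_sig_alarm(); keep this short and signal-safe */
}

static void
arm_my_timeout(void)
{
	my_timeout_id = RegisterTimeout(USER_TIMEOUT, my_timeout_handler);
	enable_timeout_after(my_timeout_id, 5000);	/* fire in 5 seconds */
}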
/* * Main entry point for bgwriter process * * This is invoked from BootstrapMain, which has already created the basic * execution environment, but not enabled signals yet. */ void BackgroundWriterMain(void) { sigjmp_buf local_sigjmp_buf; MemoryContext bgwriter_context; am_bg_writer = true; /* * If possible, make this process a group leader, so that the postmaster * can signal any child processes too. (bgwriter probably never has any * child processes, but for consistency we make all postmaster child * processes do this.) */ #ifdef HAVE_SETSID if (setsid() < 0) elog(FATAL, "setsid() failed: %m"); #endif /* * Properly accept or ignore signals the postmaster might send us * * SIGUSR1 is presently unused; keep it spare in case someday we want this * process to participate in ProcSignal signalling. */ pqsignal(SIGHUP, BgSigHupHandler); /* set flag to read config file */ pqsignal(SIGINT, SIG_IGN); /* as of 9.2 no longer requests checkpoint */ pqsignal(SIGTERM, ReqShutdownHandler); /* shutdown */ pqsignal(SIGQUIT, bg_quickdie); /* hard crash time */ pqsignal(SIGALRM, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGUSR1, SIG_IGN); /* reserve for ProcSignal */ pqsignal(SIGUSR2, SIG_IGN); /* * Reset some signals that are accepted by postmaster but not here */ pqsignal(SIGCHLD, SIG_DFL); pqsignal(SIGTTIN, SIG_DFL); pqsignal(SIGTTOU, SIG_DFL); pqsignal(SIGCONT, SIG_DFL); pqsignal(SIGWINCH, SIG_DFL); /* We allow SIGQUIT (quickdie) at all times */ sigdelset(&BlockSig, SIGQUIT); /* * Create a resource owner to keep track of our resources (currently only * buffer pins). */ CurrentResourceOwner = ResourceOwnerCreate(NULL, "Background Writer"); /* * Create a memory context that we will do all our work in. We do this so * that we can reset the context during error recovery and thereby avoid * possible memory leaks. Formerly this code just ran in * TopMemoryContext, but resetting that would be a really bad idea. */ bgwriter_context = AllocSetContextCreate(TopMemoryContext, "Background Writer", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); MemoryContextSwitchTo(bgwriter_context); /* * If an exception is encountered, processing resumes here. * * See notes in postgres.c about the design of this coding. */ if (sigsetjmp(local_sigjmp_buf, 1) != 0) { /* Since not using PG_TRY, must reset error stack by hand */ error_context_stack = NULL; /* Prevent interrupts while cleaning up */ HOLD_INTERRUPTS(); /* Report the error to the server log */ EmitErrorReport(); /* * These operations are really just a minimal subset of * AbortTransaction(). We don't have very many resources to worry * about in bgwriter, but we do have LWLocks, buffers, and temp files. */ LWLockReleaseAll(); AbortBufferIO(); UnlockBuffers(); /* buffer pins are released here: */ ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_BEFORE_LOCKS, false, true); /* we needn't bother with the other ResourceOwnerRelease phases */ AtEOXact_Buffers(false); AtEOXact_Files(); AtEOXact_HashTables(false); /* * Now return to normal top-level context and clear ErrorContext for * next time. */ MemoryContextSwitchTo(bgwriter_context); FlushErrorState(); /* Flush any leaked data in the top-level context */ MemoryContextResetAndDeleteChildren(bgwriter_context); /* Now we can allow interrupts again */ RESUME_INTERRUPTS(); /* * Sleep at least 1 second after any error. A write error is likely * to be repeated, and we don't want to be filling the error logs as * fast as we can. 
*/ pg_usleep(1000000L); /* * Close all open files after any error. This is helpful on Windows, * where holding deleted files open causes various strange errors. * It's not clear we need it elsewhere, but shouldn't hurt. */ smgrcloseall(); } /* We can now handle ereport(ERROR) */ PG_exception_stack = &local_sigjmp_buf; /* * Unblock signals (they were blocked when the postmaster forked us) */ PG_SETMASK(&UnBlockSig); /* * Use the recovery target timeline ID during recovery */ if (RecoveryInProgress()) ThisTimeLineID = GetRecoveryTargetTLI(); /* * Loop forever */ for (;;) { /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. */ if (!PostmasterIsAlive()) exit(1); if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); /* update global shmem state for sync rep */ } if (shutdown_requested) { /* * From here on, elog(ERROR) should end with exit(1), not send * control back to the sigsetjmp block above */ ExitOnAnyError = true; /* Normal exit from the bgwriter is here */ proc_exit(0); /* done */ } /* * Do one cycle of dirty-buffer writing. */ BgBufferSync(); /* Nap for the configured time. */ BgWriterNap(); } }