/*
 * cfs_bgworker_main
 *
 * Entry point of a CFS garbage-collector background worker.  The worker id
 * is carried in 'arg' and handed to cfs_scan_tablespace() on every pass.
 *
 * The loop keeps going while the scan reports success, cfs_stop is not set
 * and the shared iteration budget (cfs_state->max_iterations) has not been
 * used up.  Between passes the worker sleeps for cfs_gc_period milliseconds
 * and exits at once if the postmaster has died.
 */
static void
cfs_bgworker_main(Datum arg)
{
    int         worker_id = DatumGetInt32(arg);
    sigset_t    all_signals;

    /* Route the termination-style signals to cfs_cancel. */
    signal(SIGINT, cfs_cancel);
    signal(SIGQUIT, cfs_cancel);
    signal(SIGTERM, cfs_cancel);

    /* Unblock every signal for this process. */
    sigfillset(&all_signals);
    sigprocmask(SIG_UNBLOCK, &all_signals, NULL);

    /* We're now ready to receive signals */
    BackgroundWorkerUnblockSignals();

    elog(INFO, "Start CFS garbage collector %d", MyProcPid);

    for (;;)
    {
        int         events;

        /*
         * The three tests are evaluated in this order and short-circuit, so
         * the iteration counter is only decremented when the scan succeeded
         * and no stop was requested.
         */
        if (!cfs_scan_tablespace(worker_id))
            break;
        if (cfs_stop)
            break;
        if (--cfs_state->max_iterations < 0)
            break;

        /* Nap until the next pass; only a timeout or postmaster death wakes us. */
        events = WaitLatch(MyLatch,
                           WL_TIMEOUT | WL_POSTMASTER_DEATH,
                           cfs_gc_period /* ms */ );
        if (events & WL_POSTMASTER_DEATH)
        {
            exit(1);
        }
    }
}
/*
 * hello_main
 *
 * Body of the "hello" background worker: after every wakeup of the process
 * latch (at most 10 seconds apart) write "Hello World!" to the log, until
 * the got_sigterm flag is seen.
 */
void
hello_main(Datum main_arg)
{
    /* The SIGTERM handler has to be in place before signals are unblocked. */
    pqsignal(SIGTERM, hello_sigterm);

    /* We're now ready to receive signals */
    BackgroundWorkerUnblockSignals();

    for (;;)
    {
        int         events;

        if (got_sigterm)
            break;

        /* Sleep up to 10s, waking early if the latch is set. */
        events = WaitLatch(&MyProc->procLatch,
                           WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
                           10000L,
                           PG_WAIT_EXTENSION);
        ResetLatch(&MyProc->procLatch);

        /* Nothing left to serve once the postmaster is gone. */
        if (events & WL_POSTMASTER_DEATH)
            proc_exit(1);

        /* Say Hello to the world */
        elog(LOG, "Hello World!");
    }

    proc_exit(0);
}
/*
 * ProcWaitForSignal - sleep until another backend signals us.
 *
 * The wait is done on the generic process latch, which can be set for
 * reasons unrelated to what the caller is waiting for.  Callers must
 * therefore re-check their wakeup condition after this returns and call
 * again if it has not been met.
 */
void
ProcWaitForSignal(void)
{
    /* WL_TIMEOUT is not requested, so the timeout argument is unused. */
    (void) WaitLatch(MyLatch, WL_LATCH_SET, 0);

    ResetLatch(MyLatch);

    /* Service any interrupt that arrived while we were asleep. */
    CHECK_FOR_INTERRUPTS();
}
/*
 * Wait for readable data in the shared message queue.
 *
 * Returns as soon as at least bytes_needed bytes can be read, or as soon as
 * the readable region reaches the end of the ring buffer (in which case
 * fewer bytes may be reported).  On success *datap points at the first
 * readable byte, *nbytesp holds how many contiguous bytes can be read there,
 * and the result is SHM_MQ_SUCCESS.
 *
 * If not enough data is available: returns SHM_MQ_DETACHED when the queue
 * has been detached, SHM_MQ_WOULD_BLOCK when nowait is true, and otherwise
 * sleeps on the process latch and tries again.
 */
static shm_mq_result
shm_mq_receive_bytes(shm_mq *mq, Size bytes_needed, bool nowait,
                     Size *nbytesp, void **datap)
{
    Size        ring_len = mq->mq_ring_size;

    for (;;)
    {
        uint64      total_written;
        uint64      avail;
        Size        read_pos;
        bool        sender_gone;

        /* Unread bytes = bytes written so far minus bytes already read. */
        total_written = shm_mq_get_bytes_written(mq, &sender_gone);
        avail = total_written - mq->mq_bytes_read;
        Assert(avail <= ring_len);

        /* Position of the next unread byte inside the ring. */
        read_pos = mq->mq_bytes_read % (uint64) ring_len;

        /* Enough data, or the readable span runs up to the wrap point. */
        if (avail >= bytes_needed || read_pos + avail >= ring_len)
        {
            *nbytesp = Min(avail, ring_len - read_pos);
            *datap = &mq->mq_ring[mq->mq_ring_offset + read_pos];
            return SHM_MQ_SUCCESS;
        }

        /*
         * Detachment is deliberately tested only after the data check: a
         * message that is already sitting in the buffer can still be
         * consumed after the sender has detached.
         */
        if (sender_gone)
            return SHM_MQ_DETACHED;

        /* In nowait mode the latch is left untouched. */
        if (nowait)
            return SHM_MQ_WOULD_BLOCK;

        /*
         * Sleep until the latch is set.  It may already be set for an
         * unrelated reason, which merely costs one extra loop iteration.
         * That is preferable to resetting the latch at the top of the loop,
         * since setting an already-set latch is much cheaper than setting
         * one that has been reset.
         */
        WaitLatch(&MyProc->procLatch, WL_LATCH_SET, 0);

        /* An interrupt may have occurred while we were waiting. */
        CHECK_FOR_INTERRUPTS();

        /* Reset the latch so we don't spin. */
        ResetLatch(&MyProc->procLatch);
    }
}
/*
 * Wait until every background worker recorded in 'wstate' has announced
 * itself as ready through hdr->workers_ready.
 *
 * While waiting, set_latch_on_sigusr1 is forced to true so that SIGUSR1
 * sets our latch; the caller's previous value is put back on every way out
 * of this function (error thrown inside the loop, success, and failure).
 *
 * Raises ERROR (ERRCODE_INSUFFICIENT_RESOURCES) if check_worker_status()
 * reports a failure before all workers became ready.
 */
static void
wait_for_workers_to_become_ready(worker_state *wstate,
                                 volatile test_shm_mq_header *hdr)
{
    bool        save_set_latch_on_sigusr1;
    bool        result = false;

    save_set_latch_on_sigusr1 = set_latch_on_sigusr1;
    set_latch_on_sigusr1 = true;

    PG_TRY();
    {
        for (;;)
        {
            int         workers_ready;

            /* If all the workers are ready, we have succeeded. */
            SpinLockAcquire(&hdr->mutex);
            workers_ready = hdr->workers_ready;
            SpinLockRelease(&hdr->mutex);
            if (workers_ready >= wstate->nworkers)
            {
                result = true;
                break;
            }

            /* If any workers (or the postmaster) have died, we have failed. */
            if (!check_worker_status(wstate))
            {
                result = false;
                break;
            }

            /* Wait to be signalled. */
            WaitLatch(MyLatch, WL_LATCH_SET, 0);

            /* An interrupt may have occurred while we were waiting. */
            CHECK_FOR_INTERRUPTS();

            /* Reset the latch so we don't spin. */
            ResetLatch(MyLatch);
        }
    }
    PG_CATCH();
    {
        set_latch_on_sigusr1 = save_set_latch_on_sigusr1;
        PG_RE_THROW();
    }
    PG_END_TRY();

    /*
     * Put the flag back on the non-error path as well.  This must happen
     * before the ereport() below, which is raised outside the PG_TRY block
     * and therefore would not pass through the PG_CATCH handler.
     */
    set_latch_on_sigusr1 = save_set_latch_on_sigusr1;

    if (!result)
        ereport(ERROR,
                (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
                 errmsg("one or more background workers failed to start")));
}
/*
 * Block until a launched background worker has attached to the shmem
 * context, or is known to have stopped before doing so.
 *
 * The only purpose is to clean up the shared-memory worker slot when the
 * worker fails to attach.  'generation' identifies the worker instance we
 * launched, so that a slot since reused by another worker is not cleaned up
 * by mistake.
 */
static void
WaitForReplicationWorkerAttach(LogicalRepWorker *worker,
                               uint16 generation,
                               BackgroundWorkerHandle *handle)
{
    for (;;)
    {
        BgwHandleStatus bgw_status;
        pid_t       worker_pid;
        bool        settled;
        int         events;

        CHECK_FOR_INTERRUPTS();

        /* Slot no longer in use, or proc already set: nothing to do. */
        LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
        settled = (!worker->in_use || worker->proc);
        LWLockRelease(LogicalRepWorkerLock);

        if (settled)
            return;

        /* Did the worker stop before it ever attached?  Then clean up. */
        bgw_status = GetBackgroundWorkerPid(handle, &worker_pid);
        if (bgw_status == BGWH_STOPPED)
        {
            LWLockAcquire(LogicalRepWorkerLock, LW_EXCLUSIVE);
            /* Only touch the slot if it still belongs to our worker. */
            if (generation == worker->generation)
                logicalrep_worker_cleanup(worker);
            LWLockRelease(LogicalRepWorkerLock);
            return;
        }

        /*
         * A timeout is needed because the worker attaching generally does
         * not set our latch.  The wait is not expected to be long.
         */
        events = WaitLatch(MyLatch,
                           WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
                           10L, WAIT_EVENT_BGWORKER_STARTUP);

        if (events & WL_LATCH_SET)
        {
            ResetLatch(MyLatch);
            CHECK_FOR_INTERRUPTS();
        }
    }
}
/* * pg_sleep - delay for N seconds */ Datum pg_sleep(PG_FUNCTION_ARGS) { float8 secs = PG_GETARG_FLOAT8(0); float8 endtime; /* * We sleep using WaitLatch, to ensure that we'll wake up promptly if an * important signal (such as SIGALRM or SIGINT) arrives. Because * WaitLatch's upper limit of delay is INT_MAX milliseconds, and the user * might ask for more than that, we sleep for at most 10 minutes and then * loop. * * By computing the intended stop time initially, we avoid accumulation of * extra delay across multiple sleeps. This also ensures we won't delay * less than the specified time when WaitLatch is terminated early by a * non-query-canceling signal such as SIGHUP. */ #ifdef HAVE_INT64_TIMESTAMP #define GetNowFloat() ((float8) GetCurrentTimestamp() / 1000000.0) #else #define GetNowFloat() GetCurrentTimestamp() #endif endtime = GetNowFloat() + secs; for (;;) { float8 delay; long delay_ms; CHECK_FOR_INTERRUPTS(); delay = endtime - GetNowFloat(); if (delay >= 600.0) delay_ms = 600000; else if (delay > 0.0) delay_ms = (long) ceil(delay * 1000.0); else break; (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_TIMEOUT, delay_ms, WAIT_EVENT_PG_SLEEP); ResetLatch(MyLatch); } PG_RETURN_VOID(); }
/*
 * BufferSaverMain
 *
 * Main function of the buffer-saver worker.  After the common setup done by
 * WorkerCommon(), it idles on the process latch (waking at least every 10
 * seconds), reloads the configuration on SIGHUP, and leaves the loop once
 * the got_sigterm flag is set.  On the way out it calls SaveBuffers() if
 * guc_enabled is true.
 *
 * The function returns normally; the caller is expected to perform the
 * process exit.
 */
static void
BufferSaverMain(Datum main_arg)
{
    WorkerCommon();

    /*
     * Main loop: do this until the SIGTERM handler tells us to terminate
     */
    while (!got_sigterm)
    {
        int         rc;

        /*
         * Wait on the process latch, which sleeps as necessary, but is
         * awakened if postmaster dies.  This way the background process
         * goes away immediately in case of an emergency.
         */
        rc = WaitLatch(&MyProc->procLatch,
                       WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
                       10 * 1000L);

        /*
         * Reset the latch right after waking and *before* looking at the
         * signal flags (got_sighup below, got_sigterm in the loop test).
         * Resetting between the got_sigterm test and the wait would wipe
         * out a wakeup from a signal that landed in that window, and the
         * shutdown would then be delayed by the full timeout.
         */
        ResetLatch(&MyProc->procLatch);

        /* emergency bailout if postmaster has died */
        if (rc & WL_POSTMASTER_DEATH)
            proc_exit(1);

        /*
         * In case of a SIGHUP, just reload the configuration.
         */
        if (got_sighup)
        {
            got_sighup = false;
            ProcessConfigFile(PGC_SIGHUP);
        }
    }

    /*
     * We received the SIGTERM; Shutdown is in progress, so save the
     * shared-buffer contents.
     */

    /* Save the buffers only if the extension is enabled. */
    if (guc_enabled)
        SaveBuffers();

    /*
     * The worker exits here. A proc_exit(0) is not necessary, we'll let the
     * caller do that.
     */
}
/*
 * Wait for our counterpart to attach to the queue.
 *
 * 'ptr' is the address of the pointer we expect to become non-NULL when the
 * counterpart attaches.  Returns true once that happens.  Returns false if
 * the queue is marked detached, or, when 'handle' is not NULL, if the
 * background worker it refers to is reported as neither started nor
 * not-yet-started.
 *
 * With handle == NULL, a counterpart that never attaches leaves us waiting
 * here indefinitely for a process that may never start; interrupts are
 * still serviced in that case.
 */
static bool
shm_mq_wait_internal(volatile shm_mq *mq, PGPROC *volatile * ptr,
                     BackgroundWorkerHandle *handle)
{
    for (;;)
    {
        bool        detached;
        bool        attached;

        /* Hold the spinlock only long enough to sample both fields. */
        SpinLockAcquire(&mq->mq_mutex);
        detached = mq->mq_detached;
        attached = (*ptr != NULL);
        SpinLockRelease(&mq->mq_mutex);

        /* Detachment takes precedence over a successful attach. */
        if (detached)
            return false;
        if (attached)
            return true;

        if (handle != NULL)
        {
            BgwHandleStatus worker_status;
            pid_t       worker_pid;

            /* Check for unexpected worker death. */
            worker_status = GetBackgroundWorkerPid(handle, &worker_pid);
            if (worker_status != BGWH_STARTED &&
                worker_status != BGWH_NOT_YET_STARTED)
                return false;
        }

        /* Wait to be signalled. */
        WaitLatch(MyLatch, WL_LATCH_SET, 0);

        /* An interrupt may have occurred while we were waiting. */
        CHECK_FOR_INTERRUPTS();

        /* Reset the latch so we don't spin. */
        ResetLatch(MyLatch);
    }
}
/* * Wait until the apply worker changes the state of our synchronization * worker to the expected one. * * Used when transitioning from SYNCWAIT state to CATCHUP. * * Returns false if the apply worker has disappeared. */ static bool wait_for_worker_state_change(char expected_state) { int rc; for (;;) { LogicalRepWorker *worker; CHECK_FOR_INTERRUPTS(); /* * Done if already in correct state. (We assume this fetch is atomic * enough to not give a misleading answer if we do it with no lock.) */ if (MyLogicalRepWorker->relstate == expected_state) return true; /* * Bail out if the apply worker has died, else signal it we're * waiting. */ LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); worker = logicalrep_worker_find(MyLogicalRepWorker->subid, InvalidOid, false); if (worker && worker->proc) logicalrep_worker_wakeup_ptr(worker); LWLockRelease(LogicalRepWorkerLock); if (!worker) break; /* * Wait. We expect to get a latch signal back from the apply worker, * but use a timeout in case it dies without sending one. */ rc = WaitLatch(MyLatch, WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, 1000L, WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE); if (rc & WL_LATCH_SET) ResetLatch(MyLatch); } return false; }
/*
 * Sleep until the catalog shows 'expected_state' as the synchronization
 * state of relation 'relid'.
 *
 * Used for the CATCHUP -> SYNCDONE transition.
 *
 * Returns true once the state matches.  Returns false if the state reads
 * back as SUBREL_STATE_UNKNOWN (table state has been reset) or if the
 * counterpart worker has disappeared.
 */
static bool
wait_for_relation_state_change(Oid relid, char expected_state)
{
    for (;;)
    {
        LogicalRepWorker *peer;
        XLogRecPtr  statelsn;
        char        current_state;

        CHECK_FOR_INTERRUPTS();

        /* XXX use cache invalidation here to improve performance? */
        PushActiveSnapshot(GetLatestSnapshot());
        current_state = GetSubscriptionRelState(MyLogicalRepWorker->subid,
                                                relid, &statelsn, true);
        PopActiveSnapshot();

        if (current_state == SUBREL_STATE_UNKNOWN)
            return false;
        if (current_state == expected_state)
            return true;

        /*
         * Bail out if the worker on the other side is no longer running.
         * A tablesync worker looks for the worker registered with
         * InvalidOid; any other caller looks for the worker of 'relid'.
         */
        LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
        peer = logicalrep_worker_find(MyLogicalRepWorker->subid,
                                      am_tablesync_worker() ? InvalidOid : relid,
                                      false);
        LWLockRelease(LogicalRepWorkerLock);

        if (!peer)
            return false;

        /* Poll again after a latch wakeup or one second, whichever is first. */
        (void) WaitLatch(MyLatch,
                         WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
                         1000L, WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE);

        ResetLatch(MyLatch);
    }
}
/*
 * config_log_main
 *
 * Main function of the config_log background worker.  It connects to the
 * database named by config_log_database, initializes its objects, and then
 * idles on the process latch.  Each time a SIGHUP has been flagged it
 * reloads the configuration file and runs execute_pg_settings_logger().
 * The loop ends when got_sigterm is set.
 */
static void
config_log_main(Datum main_arg)
{
    config_log_objects *objects;

    /* Handlers go in before signals are unblocked. */
    pqsignal(SIGTERM, config_log_sigterm);
    pqsignal(SIGHUP, config_log_sighup);

    /* We're now ready to receive signals */
    BackgroundWorkerUnblockSignals();

    /* Connect to database */
    BackgroundWorkerInitializeConnection(config_log_database, NULL);

    /* Verify expected objects exist */
    objects = initialize_objects();

    for (;;)
    {
        int         events;

        if (got_sigterm)
            break;

        events = WaitLatch(&MyProc->procLatch,
                           WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
                           100000L);
        ResetLatch(&MyProc->procLatch);

        /* emergency bailout if postmaster has died */
        if (events & WL_POSTMASTER_DEATH)
            proc_exit(1);

        /* Nothing to do unless a SIGHUP was flagged. */
        if (!got_sighup)
            continue;

        /* Reload the configuration, then log the settings. */
        got_sighup = false;
        ProcessConfigFile(PGC_SIGHUP);
        execute_pg_settings_logger(objects);
    }

    proc_exit(0);
}
/** * Main loop of the sender process. It wakes up every * gp_perfmon_segment_interval ms to send segment * information to perfmon */ static void SegmentInfoSenderLoop(void) { int rc; int counter; for (counter = 0;; counter += SEGMENT_INFO_LOOP_SLEEP_MS) { CHECK_FOR_INTERRUPTS(); if (senderShutdownRequested) { break; } /* no need to live on if postmaster has died */ if (!PostmasterIsAlive()) exit(1); if (cluster_state_collect_hook) cluster_state_collect_hook(); if (gp_enable_gpperfmon && counter >= gp_perfmon_segment_interval) { SegmentInfoSender(); counter = 0; } /* Sleep a while. */ rc = WaitLatch(&MyProc->procLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, SEGMENT_INFO_LOOP_SLEEP_MS); ResetLatch(&MyProc->procLatch); /* emergency bailout if postmaster has died */ if (rc & WL_POSTMASTER_DEATH) proc_exit(1); } /* end server loop */ return; }
/*
 * hello_main
 *
 * Body of the "hello signal" background worker.  It wakes at least once a
 * second, reloads the configuration file when a SIGHUP has been flagged,
 * and exits with status 0 when a SIGTERM has been flagged.  The loop is
 * only ever left through proc_exit().
 */
void
hello_main(Datum main_arg)
{
    /* Register functions for SIGTERM/SIGHUP management */
    pqsignal(SIGHUP, hello_sighup);
    pqsignal(SIGTERM, hello_sigterm);

    /* We're now ready to receive signals */
    BackgroundWorkerUnblockSignals();

    for (;;)
    {
        /* Wait 1s */
        WaitLatch(MyLatch,
                  WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
                  1000L,
                  PG_WAIT_EXTENSION);
        ResetLatch(MyLatch);

        /* SIGHUP is handled first: re-read the configuration file. */
        if (got_sighup)
        {
            ProcessConfigFile(PGC_SIGHUP);
            got_sighup = false;
            ereport(LOG, (errmsg("hello signal: processed SIGHUP")));
        }

        /* SIGTERM: log it and leave. */
        if (got_sigterm)
        {
            ereport(LOG, (errmsg("hello signal: processed SIGTERM")));
            proc_exit(0);
        }
    }
}
/* * pgarch_MainLoop * * Main loop for archiver */ static void pgarch_MainLoop(void) { pg_time_t last_copy_time = 0; bool time_to_stop; /* * We run the copy loop immediately upon entry, in case there are * unarchived files left over from a previous database run (or maybe the * archiver died unexpectedly). After that we wait for a signal or * timeout before doing more. */ wakened = true; /* * There shouldn't be anything for the archiver to do except to wait * for a signal ... however, the archiver exists to protect our data, * so she wakes up occasionally to allow herself to be proactive. */ do { ResetLatch(&mainloop_latch); /* When we get SIGUSR2, we do one more archive cycle, then exit */ time_to_stop = ready_to_stop; /* Check for config update */ if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); } /* * If we've gotten SIGTERM, we normally just sit and do nothing until * SIGUSR2 arrives. However, that means a random SIGTERM would * disable archiving indefinitely, which doesn't seem like a good * idea. If more than 60 seconds pass since SIGTERM, exit anyway, so * that the postmaster can start a new archiver if needed. */ if (got_SIGTERM) { time_t curtime = time(NULL); if (last_sigterm_time == 0) last_sigterm_time = curtime; else if ((unsigned int) (curtime - last_sigterm_time) >= (unsigned int) 60) break; } /* Do what we're here for */ if (wakened || time_to_stop) { wakened = false; pgarch_ArchiverCopyLoop(); last_copy_time = time(NULL); } /* * Sleep until a signal is received, or until a poll is forced by * PGARCH_AUTOWAKE_INTERVAL having passed since last_copy_time, or * until postmaster dies. 
*/ if (!time_to_stop) /* Don't wait during last iteration */ { pg_time_t curtime = (pg_time_t) time(NULL); int timeout; timeout = PGARCH_AUTOWAKE_INTERVAL - (curtime - last_copy_time); if (timeout > 0) { int rc; rc = WaitLatch(&mainloop_latch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, timeout * 1000000L); if (rc & WL_TIMEOUT) wakened = true; } else wakened = true; } /* * The archiver quits either when the postmaster dies (not expected) * or after completing one more archiving cycle after receiving * SIGUSR2. */ } while (PostmasterIsAlive() && !time_to_stop); }
/*
 * Stop the logical replication worker registered for subid/relid, if there
 * is one, and wait until it detaches from the slot.
 *
 * LogicalRepWorkerLock is taken in shared mode on entry and is always
 * released again before returning; it is dropped around every latch wait
 * and re-taken before the worker slot is looked at again.
 */
void
logicalrep_worker_stop(Oid subid, Oid relid)
{
    LogicalRepWorker *worker;
    uint16      generation;

    LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);

    worker = logicalrep_worker_find(subid, relid, false);

    /* No worker, nothing to do. */
    if (!worker)
    {
        LWLockRelease(LogicalRepWorkerLock);
        return;
    }

    /*
     * Note the generation of the worker we found, so that later checks can
     * tell whether the slot still holds that same worker.
     */
    generation = worker->generation;

    /*
     * A slot that is in use but has no proc yet belongs to a worker that is
     * still starting up: wait for startup to finish, then kill it.
     */
    for (;;)
    {
        int         events;

        if (!(worker->in_use && !worker->proc))
            break;

        LWLockRelease(LogicalRepWorkerLock);

        /* Wait a bit --- we don't expect to have to wait long. */
        events = WaitLatch(MyLatch,
                           WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
                           10L, WAIT_EVENT_BGWORKER_STARTUP);

        if (events & WL_LATCH_SET)
        {
            ResetLatch(MyLatch);
            CHECK_FOR_INTERRUPTS();
        }

        /* Recheck worker status. */
        LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);

        /*
         * If the slot is no longer in use the worker has exited; if the
         * generation differs, a different worker has taken the slot.
         * Either way there is nothing left for us to stop.
         */
        if (!worker->in_use || worker->generation != generation)
        {
            LWLockRelease(LogicalRepWorkerLock);
            return;
        }

        /* Worker has assigned proc, so it has started. */
        if (worker->proc)
            break;
    }

    /* Now terminate the worker ... */
    kill(worker->proc->pid, SIGTERM);

    /* ... and wait for it to die. */
    for (;;)
    {
        int         events;

        /* is it gone? */
        if (!worker->proc || worker->generation != generation)
            break;

        LWLockRelease(LogicalRepWorkerLock);

        /* Wait a bit --- we don't expect to have to wait long. */
        events = WaitLatch(MyLatch,
                           WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
                           10L, WAIT_EVENT_BGWORKER_SHUTDOWN);

        if (events & WL_LATCH_SET)
        {
            ResetLatch(MyLatch);
            CHECK_FOR_INTERRUPTS();
        }

        LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
    }

    LWLockRelease(LogicalRepWorkerLock);
}
/* * Transmit a libpq protocol message to the shared memory message queue * selected via pq_mq_handle. We don't include a length word, because the * receiver will know the length of the message from shm_mq_receive(). */ static int mq_putmessage(char msgtype, const char *s, size_t len) { shm_mq_iovec iov[2]; shm_mq_result result; /* * If we're sending a message, and we have to wait because the queue is * full, and then we get interrupted, and that interrupt results in trying * to send another message, we respond by detaching the queue. There's no * way to return to the original context, but even if there were, just * queueing the message would amount to indefinitely postponing the * response to the interrupt. So we do this instead. */ if (pq_mq_busy) { if (pq_mq != NULL) shm_mq_detach(pq_mq); pq_mq = NULL; pq_mq_handle = NULL; return EOF; } /* * If the message queue is already gone, just ignore the message. This * doesn't necessarily indicate a problem; for example, DEBUG messages * can be generated late in the shutdown sequence, after all DSMs have * already been detached. */ if (pq_mq == NULL) return 0; pq_mq_busy = true; iov[0].data = &msgtype; iov[0].len = 1; iov[1].data = s; iov[1].len = len; Assert(pq_mq_handle != NULL); for (;;) { result = shm_mq_sendv(pq_mq_handle, iov, 2, true); if (pq_mq_parallel_master_pid != 0) SendProcSignal(pq_mq_parallel_master_pid, PROCSIG_PARALLEL_MESSAGE, pq_mq_parallel_master_backend_id); if (result != SHM_MQ_WOULD_BLOCK) break; WaitLatch(&MyProc->procLatch, WL_LATCH_SET, 0); CHECK_FOR_INTERRUPTS(); ResetLatch(&MyProc->procLatch); } pq_mq_busy = false; Assert(result == SHM_MQ_SUCCESS || result == SHM_MQ_DETACHED); if (result != SHM_MQ_SUCCESS) return EOF; return 0; }
/*
 * wed_worker_main
 *
 * Main function of the WED background worker.  It connects to the database
 * named by wed_worker_db_name and then, after every wakeup of the process
 * latch (at most wed_worker_naptime seconds apart), runs "SELECT trcheck()"
 * through SPI inside its own transaction.  The loop ends when got_sigterm
 * is set.
 */
void
wed_worker_main(Datum main_arg)
{
    StringInfoData query;

    /* Establish signal handlers before unblocking signals. */
    pqsignal(SIGHUP, wed_worker_sighup);
    pqsignal(SIGTERM, wed_worker_sigterm);

    /* We're now ready to receive signals */
    BackgroundWorkerUnblockSignals();

    /* Connect to our database */
    BackgroundWorkerInitializeConnection(wed_worker_db_name, NULL);

    elog(LOG, "%s initialized in: %s",
         MyBgworkerEntry->bgw_name, wed_worker_db_name);

    /* The statement text is built once and reused on every pass. */
    initStringInfo(&query);
    appendStringInfo(&query, "SELECT trcheck()");

    /*
     * Main loop: do this until the SIGTERM handler tells us to terminate
     */
    for (;;)
    {
        int         events;
        int         spi_rc;

        if (got_sigterm)
            break;

        /*
         * Background workers mustn't call usleep() or any direct
         * equivalent: instead, they may wait on their process latch, which
         * sleeps as necessary, but is awakened if postmaster dies.  That
         * way the background process goes away immediately in an emergency.
         */
        events = WaitLatch(&MyProc->procLatch,
                           WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
                           wed_worker_naptime * 1000L);
        ResetLatch(&MyProc->procLatch);

        /* emergency bailout if postmaster has died */
        if (events & WL_POSTMASTER_DEATH)
            proc_exit(1);

        /* In case of a SIGHUP, just reload the configuration. */
        if (got_sighup)
        {
            got_sighup = false;
            ProcessConfigFile(PGC_SIGHUP);
        }

        /*
         * Open a transaction to run the query in.  Every
         * StartTransactionCommand() should be preceded by
         * SetCurrentStatementStartTimestamp(), which sets both the
         * statement start time and the transaction start time.
         *
         * SPI_connect() gives us access to the SPI manager, and
         * PushActiveSnapshot() installs the "active" snapshot queries need
         * in order to have MVCC data to work on.
         *
         * pgstat_report_activity() makes our activity visible through the
         * pgstat views.
         */
        SetCurrentStatementStartTimestamp();
        StartTransactionCommand();
        SPI_connect();
        PushActiveSnapshot(GetTransactionSnapshot());
        pgstat_report_activity(STATE_RUNNING, query.data);

        /* We can now execute queries via SPI */
        spi_rc = SPI_execute(query.data, false, 0);

        if (spi_rc != SPI_OK_SELECT)
            elog(FATAL, "stored procedure trcheck() not found: error code %d", spi_rc);

        elog(LOG, "%s : trcheck() done !", MyBgworkerEntry->bgw_name);

        /* And finish our transaction. */
        SPI_finish();
        PopActiveSnapshot();
        CommitTransactionCommand();
        pgstat_report_activity(STATE_IDLE, NULL);
    }

    proc_exit(1);
}
/* * Perform garbage collection (if required) of file * @param map_path path to file map file (*.map). */ static bool cfs_gc_file(char* map_path) { int md = open(map_path, O_RDWR|PG_BINARY, 0); FileMap* map; uint32 physSize; uint32 usedSize; uint32 virtSize; int suf = strlen(map_path)-4; int fd = -1, fd2 = -1, md2 = -1; bool succeed = true; if (md < 0) { elog(LOG, "Failed to open map file %s: %m", map_path); return false; } map = cfs_mmap(md); if (map == MAP_FAILED) { elog(LOG, "Failed to map file %s: %m", map_path); close(md); return false; } usedSize = pg_atomic_read_u32(&map->usedSize); physSize = pg_atomic_read_u32(&map->physSize); virtSize = pg_atomic_read_u32(&map->virtSize); if ((physSize - usedSize)*100 > physSize*cfs_gc_threshold) /* do we need to perform defragmentation? */ { long delay = CFS_LOCK_MIN_TIMEOUT; char* file_path = (char*)palloc(suf+1); char* map_bck_path = (char*)palloc(suf+10); char* file_bck_path = (char*)palloc(suf+5); FileMap* newMap = (FileMap*)palloc0(sizeof(FileMap)); uint32 newSize = 0; inode_t** inodes = (inode_t**)palloc(RELSEG_SIZE*sizeof(inode_t*)); bool remove_backups = true; int n_pages = virtSize / BLCKSZ; TimestampTz startTime, endTime; long secs; int usecs; int i; startTime = GetCurrentTimestamp(); memcpy(file_path, map_path, suf); file_path[suf] = '\0'; strcat(strcpy(map_bck_path, map_path), ".bck"); strcat(strcpy(file_bck_path, file_path), ".bck"); while (true) { uint32 access_count = 0; if (pg_atomic_compare_exchange_u32(&map->lock, &access_count, CFS_GC_LOCK)) { break; } if (access_count >= CFS_GC_LOCK) { /* Uhhh... looks like last GC was interrupted. 
* Try to recover file */ if (access(file_bck_path, R_OK) != 0) { /* There is no backup file: new map should be constructed */ md2 = open(map_bck_path, O_RDWR|PG_BINARY, 0); if (md2 >= 0) { /* Recover map */ if (!cfs_read_file(md2, newMap, sizeof(FileMap))) { elog(LOG, "Failed to read file %s: %m", map_bck_path); goto Cleanup; } close(md2); md2 = -1; newSize = pg_atomic_read_u32(&newMap->usedSize); remove_backups = false; goto ReplaceMap; } } else { /* Presence of backup file means that we still have unchanged data and map files. * Just remove backup files, grab lock and continue processing */ unlink(file_bck_path); unlink(map_bck_path); break; } } pg_usleep(delay); if (delay < CFS_LOCK_MAX_TIMEOUT) { delay *= 2; } } md2 = open(map_bck_path, O_CREAT|O_RDWR|PG_BINARY|O_TRUNC, 0600); if (md2 < 0) { goto Cleanup; } for (i = 0; i < n_pages; i++) { newMap->inodes[i] = map->inodes[i]; inodes[i] = &newMap->inodes[i]; } /* sort inodes by offset to improve read locality */ qsort(inodes, n_pages, sizeof(inode_t*), cfs_cmp_page_offs); fd = open(file_path, O_RDWR|PG_BINARY, 0); if (fd < 0) { goto Cleanup; } fd2 = open(file_bck_path, O_CREAT|O_RDWR|PG_BINARY|O_TRUNC, 0600); if (fd2 < 0) { goto Cleanup; } for (i = 0; i < n_pages; i++) { int size = CFS_INODE_SIZE(*inodes[i]); if (size != 0) { char block[BLCKSZ]; off_t rc PG_USED_FOR_ASSERTS_ONLY; uint32 offs = CFS_INODE_OFFS(*inodes[i]); Assert(size <= BLCKSZ); rc = lseek(fd, offs, SEEK_SET); Assert(rc == offs); if (!cfs_read_file(fd, block, size)) { elog(LOG, "Failed to read file %s: %m", file_path); goto Cleanup; } if (!cfs_write_file(fd2, block, size)) { elog(LOG, "Failed to write file %s: %m", file_bck_path); goto Cleanup; } offs = newSize; newSize += size; *inodes[i] = CFS_INODE(size, offs); } } pg_atomic_write_u32(&map->usedSize, newSize); if (close(fd) < 0) { elog(LOG, "Failed to close file %s: %m", file_path); goto Cleanup; } fd = -1; /* Persist copy of data file */ if (pg_fsync(fd2) < 0) { elog(LOG, "Failed to sync file 
%s: %m", file_bck_path); goto Cleanup; } if (close(fd2) < 0) { elog(LOG, "Failed to close file %s: %m", file_bck_path); goto Cleanup; } fd2 = -1; /* Persist copy of map file */ if (!cfs_write_file(md2, &newMap, sizeof(newMap))) { elog(LOG, "Failed to write file %s: %m", map_bck_path); goto Cleanup; } if (pg_fsync(md2) < 0) { elog(LOG, "Failed to sync file %s: %m", map_bck_path); goto Cleanup; } if (close(md2) < 0) { elog(LOG, "Failed to close file %s: %m", map_bck_path); goto Cleanup; } md2 = -1; /* Persist map with CFS_GC_LOCK set: in case of crash we will know that map may be changed by GC */ if (cfs_msync(map) < 0) { elog(LOG, "Failed to sync map %s: %m", map_path); goto Cleanup; } if (pg_fsync(md) < 0) { elog(LOG, "Failed to sync file %s: %m", map_path); goto Cleanup; } /* * Now all information necessary for recovery is stored. * We are ready to replace existed file with defragmented one. * Use rename and rely on file system to provide atomicity of this operation. */ remove_backups = false; if (rename(file_bck_path, file_path) < 0) { elog(LOG, "Failed to rename file %s: %m", file_path); goto Cleanup; } ReplaceMap: /* At this moment defragmented file version is stored. We can perfrom in-place update of map. 
* If crash happens at this point, map can be recovered from backup file */ memcpy(map->inodes, newMap->inodes, n_pages * sizeof(inode_t)); pg_atomic_write_u32(&map->usedSize, newSize); pg_atomic_write_u32(&map->physSize, newSize); map->generation += 1; /* force all backends to reopen the file */ /* Before removing backup files and releasing locks we need to flush updated map file */ if (cfs_msync(map) < 0) { elog(LOG, "Failed to sync map %s: %m", map_path); goto Cleanup; } if (pg_fsync(md) < 0) { elog(LOG, "Failed to sync file %s: %m", map_path); Cleanup: if (fd >= 0) close(fd); if (fd2 >= 0) close(fd2); if (md2 >= 0) close(md2); if (remove_backups) { unlink(file_bck_path); unlink(map_bck_path); remove_backups = false; } succeed = false; } else { remove_backups = true; /* now backups are not need any more */ } pg_atomic_fetch_sub_u32(&map->lock, CFS_GC_LOCK); /* release lock */ /* remove map backup file */ if (remove_backups && unlink(map_bck_path)) { elog(LOG, "Failed to unlink file %s: %m", map_bck_path); succeed = false; } endTime = GetCurrentTimestamp(); TimestampDifference(startTime, endTime, &secs, &usecs); elog(LOG, "%d: defragment file %s: old size %d, new size %d, logical size %d, used %d, compression ratio %f, time %ld usec", MyProcPid, file_path, physSize, newSize, virtSize, usedSize, (double)virtSize/newSize, secs*USECS_PER_SEC + usecs); pfree(file_path); pfree(file_bck_path); pfree(map_bck_path); pfree(inodes); pfree(newMap); if (cfs_gc_delay != 0) { int rc = WaitLatch(MyLatch, WL_TIMEOUT | WL_POSTMASTER_DEATH, cfs_gc_delay /* ms */ ); if (rc & WL_POSTMASTER_DEATH) { exit(1); } } } else if (cfs_state->max_iterations == 1) { elog(LOG, "%d: file %.*s: physical size %d, logical size %d, used %d, compression ratio %f", MyProcPid, suf, map_path, physSize, virtSize, usedSize, (double)virtSize/physSize); } if (cfs_munmap(map) < 0) { elog(LOG, "Failed to unmap file %s: %m", map_path); succeed = false; } if (close(md) < 0) { elog(LOG, "Failed to close file %s: 
%m", map_path); succeed = false; } return succeed; }
/* * Pipelined test of the shared memory message queue infrastructure. * * As in the basic test, we set up a ring of message queues passing through * 1 or more background processes and eventually looping back to ourselves. * Then, we send N copies of the user-specified message through the ring and * receive them all back. Since this might fill up all message queues in the * ring and then stall, we must be prepared to begin receiving the messages * back before we've finished sending them. */ Datum test_shm_mq_pipelined(PG_FUNCTION_ARGS) { int64 queue_size = PG_GETARG_INT64(0); text *message = PG_GETARG_TEXT_PP(1); char *message_contents = VARDATA_ANY(message); int message_size = VARSIZE_ANY_EXHDR(message); int32 loop_count = PG_GETARG_INT32(2); int32 nworkers = PG_GETARG_INT32(3); bool verify = PG_GETARG_BOOL(4); int32 send_count = 0; int32 receive_count = 0; dsm_segment *seg; shm_mq_handle *outqh; shm_mq_handle *inqh; shm_mq_result res; Size len; void *data; /* A negative loopcount is nonsensical. */ if (loop_count < 0) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("repeat count size must be a non-negative integer"))); /* * Using the nonblocking interfaces, we can even send data to ourselves, * so the minimum number of workers for this test is zero. */ if (nworkers < 0) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("number of workers must be a non-negative integer"))); /* Set up dynamic shared memory segment and background workers. */ test_shm_mq_setup(queue_size, nworkers, &seg, &outqh, &inqh); /* Main loop. */ for (;;) { bool wait = true; /* * If we haven't yet sent the message the requisite number of times, * try again to send it now. Note that when shm_mq_send() returns * SHM_MQ_WOULD_BLOCK, the next call to that function must pass the * same message size and contents; that's not an issue here because * we're sending the same message every time. 
*/ if (send_count < loop_count) { res = shm_mq_send(outqh, message_size, message_contents, true); if (res == SHM_MQ_SUCCESS) { ++send_count; wait = false; } else if (res == SHM_MQ_DETACHED) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("could not send message"))); } /* * If we haven't yet received the message the requisite number of * times, try to receive it again now. */ if (receive_count < loop_count) { res = shm_mq_receive(inqh, &len, &data, true); if (res == SHM_MQ_SUCCESS) { ++receive_count; /* Verifying every time is slow, so it's optional. */ if (verify) verify_message(message_size, message_contents, len, data); wait = false; } else if (res == SHM_MQ_DETACHED) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("could not receive message"))); } else { /* * Otherwise, we've received the message enough times. This * shouldn't happen unless we've also sent it enough times. */ if (send_count != receive_count) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("message sent %d times, but received %d times", send_count, receive_count))); break; } if (wait) { /* * If we made no progress, wait for one of the other processes to * which we are connected to set our latch, indicating that they * have read or written data and therefore there may now be work * for us to do. */ WaitLatch(MyLatch, WL_LATCH_SET, 0, PG_WAIT_EXTENSION); ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); } } /* Clean up. */ dsm_detach(seg); PG_RETURN_VOID(); }
/*
 * Main entry point for bgwriter process
 *
 * This is invoked from AuxiliaryProcessMain, which has already created the
 * basic execution environment, but not enabled signals yet.
 *
 * The function installs signal handlers, sets up a private resource owner
 * and memory context, establishes a sigsetjmp() error-recovery point, and
 * then runs the main loop forever.  It never returns to its caller: the
 * only ways out visible here are proc_exit(0) once shutdown_requested is
 * seen, and exit(1) when WaitLatch reports postmaster death.
 */
void
BackgroundWriterMain(void)
{
	sigjmp_buf	local_sigjmp_buf;
	MemoryContext bgwriter_context;
	bool		prev_hibernate;

	/*
	 * Properly accept or ignore signals the postmaster might send us.
	 *
	 * bgwriter doesn't participate in ProcSignal signalling, but a SIGUSR1
	 * handler is still needed for latch wakeups.
	 */
	pqsignal(SIGHUP, BgSigHupHandler);	/* set flag to read config file */
	pqsignal(SIGINT, SIG_IGN);
	pqsignal(SIGTERM, ReqShutdownHandler);		/* shutdown */
	pqsignal(SIGQUIT, bg_quickdie);		/* hard crash time */
	pqsignal(SIGALRM, SIG_IGN);
	pqsignal(SIGPIPE, SIG_IGN);
	pqsignal(SIGUSR1, bgwriter_sigusr1_handler);
	pqsignal(SIGUSR2, SIG_IGN);

	/*
	 * Reset some signals that are accepted by postmaster but not here
	 */
	pqsignal(SIGCHLD, SIG_DFL);
	pqsignal(SIGTTIN, SIG_DFL);
	pqsignal(SIGTTOU, SIG_DFL);
	pqsignal(SIGCONT, SIG_DFL);
	pqsignal(SIGWINCH, SIG_DFL);

	/* We allow SIGQUIT (quickdie) at all times */
	sigdelset(&BlockSig, SIGQUIT);

	/*
	 * Create a resource owner to keep track of our resources (currently only
	 * buffer pins).
	 */
	CurrentResourceOwner = ResourceOwnerCreate(NULL, "Background Writer");

	/*
	 * We just started, assume there has been either a shutdown or
	 * end-of-recovery snapshot.
	 */
	last_snapshot_ts = GetCurrentTimestamp();

	/*
	 * Create a memory context that we will do all our work in.  We do this so
	 * that we can reset the context during error recovery and thereby avoid
	 * possible memory leaks.  Formerly this code just ran in
	 * TopMemoryContext, but resetting that would be a really bad idea.
	 */
	bgwriter_context = AllocSetContextCreate(TopMemoryContext,
											 "Background Writer",
											 ALLOCSET_DEFAULT_MINSIZE,
											 ALLOCSET_DEFAULT_INITSIZE,
											 ALLOCSET_DEFAULT_MAXSIZE);
	MemoryContextSwitchTo(bgwriter_context);

	/*
	 * If an exception is encountered, processing resumes here.
	 *
	 * See notes in postgres.c about the design of this coding.
	 *
	 * The block below runs only on the error path (non-zero return from
	 * sigsetjmp); afterwards control falls through into the normal startup
	 * code that follows, so the main loop is re-entered after cleanup.
	 */
	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
	{
		/* Since not using PG_TRY, must reset error stack by hand */
		error_context_stack = NULL;

		/* Prevent interrupts while cleaning up */
		HOLD_INTERRUPTS();

		/* Report the error to the server log */
		EmitErrorReport();

		/*
		 * These operations are really just a minimal subset of
		 * AbortTransaction().  We don't have very many resources to worry
		 * about in bgwriter, but we do have LWLocks, buffers, and temp files.
		 */
		LWLockReleaseAll();
		AbortBufferIO();
		UnlockBuffers();
		/* buffer pins are released here: */
		ResourceOwnerRelease(CurrentResourceOwner,
							 RESOURCE_RELEASE_BEFORE_LOCKS,
							 false, true);
		/* we needn't bother with the other ResourceOwnerRelease phases */
		AtEOXact_Buffers(false);
		AtEOXact_SMgr();
		AtEOXact_Files();
		AtEOXact_HashTables(false);

		/*
		 * Now return to normal top-level context and clear ErrorContext for
		 * next time.
		 */
		MemoryContextSwitchTo(bgwriter_context);
		FlushErrorState();

		/* Flush any leaked data in the top-level context */
		MemoryContextResetAndDeleteChildren(bgwriter_context);

		/* Now we can allow interrupts again */
		RESUME_INTERRUPTS();

		/*
		 * Sleep at least 1 second after any error.  A write error is likely
		 * to be repeated, and we don't want to be filling the error logs as
		 * fast as we can.
		 */
		pg_usleep(1000000L);

		/*
		 * Close all open files after any error.  This is helpful on Windows,
		 * where holding deleted files open causes various strange errors.
		 * It's not clear we need it elsewhere, but shouldn't hurt.
		 */
		smgrcloseall();

		/* Report wait end here, when there is no further possibility of wait */
		pgstat_report_wait_end();
	}

	/* We can now handle ereport(ERROR) */
	PG_exception_stack = &local_sigjmp_buf;

	/*
	 * Unblock signals (they were blocked when the postmaster forked us)
	 */
	PG_SETMASK(&UnBlockSig);

	/*
	 * Reset hibernation state after any error.
	 */
	prev_hibernate = false;

	/*
	 * Loop forever
	 */
	for (;;)
	{
		bool		can_hibernate;
		int			rc;

		/* Clear any already-pending wakeups */
		ResetLatch(MyLatch);

		/* Re-read the configuration if the SIGHUP flag has been set. */
		if (got_SIGHUP)
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP);
		}
		if (shutdown_requested)
		{
			/*
			 * From here on, elog(ERROR) should end with exit(1), not send
			 * control back to the sigsetjmp block above
			 */
			ExitOnAnyError = true;
			/* Normal exit from the bgwriter is here */
			proc_exit(0);		/* done */
		}

		/*
		 * Do one cycle of dirty-buffer writing.  The result is remembered
		 * and later used to decide whether we may hibernate.
		 */
		can_hibernate = BgBufferSync();

		/*
		 * Send off activity statistics to the stats collector
		 */
		pgstat_send_bgwriter();

		if (FirstCallSinceLastCheckpoint())
		{
			/*
			 * After any checkpoint, close all smgr files.  This is so we
			 * won't hang onto smgr references to deleted files indefinitely.
			 */
			smgrcloseall();
		}

		/*
		 * Log a new xl_running_xacts every now and then so replication can
		 * get into a consistent state faster (think of suboverflowed
		 * snapshots) and clean up resources (locks, KnownXids*) more
		 * frequently.  The costs of this are relatively low, so doing it 4
		 * times (LOG_SNAPSHOT_INTERVAL_MS) a minute seems fine.
		 *
		 * We assume the interval for writing xl_running_xacts is
		 * significantly bigger than BgWriterDelay, so we don't complicate the
		 * overall timeout handling but just assume we're going to get called
		 * often enough even if hibernation mode is active.  It's not that
		 * important that log_snap_interval_ms is met strictly.  To make sure
		 * we're not waking the disk up unnecessarily on an idle system we
		 * check whether there has been any WAL inserted since the last time
		 * we've logged a running xacts.
		 *
		 * We do this logging in the bgwriter as it's the only process that is
		 * run regularly and returns to its mainloop all the time.  E.g.
		 * Checkpointer, when active, is barely ever in its mainloop and thus
		 * makes it hard to log regularly.
		 */
		if (XLogStandbyInfoActive() && !RecoveryInProgress())
		{
			TimestampTz timeout = 0;
			TimestampTz now = GetCurrentTimestamp();

			/* Earliest moment at which the next snapshot may be logged. */
			timeout = TimestampTzPlusMilliseconds(last_snapshot_ts,
												  LOG_SNAPSHOT_INTERVAL_MS);

			/*
			 * only log if enough time has passed and some xlog record has
			 * been inserted.
			 */
			if (now >= timeout &&
				last_snapshot_lsn != GetXLogInsertRecPtr())
			{
				last_snapshot_lsn = LogStandbySnapshot();
				last_snapshot_ts = now;
			}
		}

		/*
		 * Sleep until we are signaled or BgWriterDelay has elapsed.
		 *
		 * Note: the feedback control loop in BgBufferSync() expects that we
		 * will call it every BgWriterDelay msec.  While it's not critical for
		 * correctness that that be exact, the feedback loop might misbehave
		 * if we stray too far from that.  Hence, avoid loading this process
		 * down with latch events that are likely to happen frequently during
		 * normal operation.
		 */
		rc = WaitLatch(MyLatch,
					   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
					   BgWriterDelay /* ms */ );

		/*
		 * If no latch event and BgBufferSync says nothing's happening, extend
		 * the sleep in "hibernation" mode, where we sleep for much longer
		 * than bgwriter_delay says.  Fewer wakeups save electricity.  When a
		 * backend starts using buffers again, it will wake us up by setting
		 * our latch.  Because the extra sleep will persist only as long as no
		 * buffer allocations happen, this should not distort the behavior of
		 * BgBufferSync's control loop too badly; essentially, it will think
		 * that the system-wide idle interval didn't exist.
		 *
		 * There is a race condition here, in that a backend might allocate a
		 * buffer between the time BgBufferSync saw the alloc count as zero
		 * and the time we call StrategyNotifyBgWriter.  While it's not
		 * critical that we not hibernate anyway, we try to reduce the odds of
		 * that by only hibernating when BgBufferSync says nothing's happening
		 * for two consecutive cycles.  Also, we mitigate any possible
		 * consequences of a missed wakeup by not hibernating forever.
		 */
		if (rc == WL_TIMEOUT && can_hibernate && prev_hibernate)
		{
			/* Ask for notification at next buffer allocation */
			StrategyNotifyBgWriter(MyProc->pgprocno);
			/* Sleep ... */
			rc = WaitLatch(MyLatch,
						   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
						   BgWriterDelay * HIBERNATE_FACTOR);
			/* Reset the notification request in case we timed out */
			StrategyNotifyBgWriter(-1);
		}

		/*
		 * Emergency bailout if postmaster has died.  This is to avoid the
		 * necessity for manual cleanup of all postmaster children.
		 */
		if (rc & WL_POSTMASTER_DEATH)
			exit(1);

		/* Remember this cycle's verdict for the two-in-a-row test above. */
		prev_hibernate = can_hibernate;
	}
}
/*
 * Main entry point for checkpointer process
 *
 * This is invoked from AuxiliaryProcessMain, which has already created the
 * basic execution environment, but not enabled signals yet.
 *
 * The function publishes its PID in CheckpointerShmem, installs signal
 * handlers, sets up a private resource owner and memory context,
 * establishes a sigsetjmp() error-recovery point, and then loops forever
 * performing checkpoints or restartpoints on request or on timeout.  It
 * never returns to its caller: the exits visible here are proc_exit(0)
 * after ShutdownXLOG() when shutdown_requested is seen, and exit(1) when
 * WaitLatch reports postmaster death.
 */
void
CheckpointerMain(void)
{
	sigjmp_buf	local_sigjmp_buf;
	MemoryContext checkpointer_context;

	CheckpointerShmem->checkpointer_pid = MyProcPid;

	/*
	 * Properly accept or ignore signals the postmaster might send us
	 *
	 * Note: we deliberately ignore SIGTERM, because during a standard Unix
	 * system shutdown cycle, init will SIGTERM all processes at once.  We
	 * want to wait for the backends to exit, whereupon the postmaster will
	 * tell us it's okay to shut down (via SIGUSR2).
	 */
	pqsignal(SIGHUP, ChkptSigHupHandler);		/* set flag to read config
												 * file */
	pqsignal(SIGINT, ReqCheckpointHandler);		/* request checkpoint */
	pqsignal(SIGTERM, SIG_IGN); /* ignore SIGTERM */
	pqsignal(SIGQUIT, chkpt_quickdie);	/* hard crash time */
	pqsignal(SIGALRM, SIG_IGN);
	pqsignal(SIGPIPE, SIG_IGN);
	pqsignal(SIGUSR1, chkpt_sigusr1_handler);
	pqsignal(SIGUSR2, ReqShutdownHandler);		/* request shutdown */

	/*
	 * Reset some signals that are accepted by postmaster but not here
	 */
	pqsignal(SIGCHLD, SIG_DFL);
	pqsignal(SIGTTIN, SIG_DFL);
	pqsignal(SIGTTOU, SIG_DFL);
	pqsignal(SIGCONT, SIG_DFL);
	pqsignal(SIGWINCH, SIG_DFL);

	/* We allow SIGQUIT (quickdie) at all times */
	sigdelset(&BlockSig, SIGQUIT);

	/*
	 * Initialize so that first time-driven event happens at the correct time.
	 */
	last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);

	/*
	 * Create a resource owner to keep track of our resources (currently only
	 * buffer pins).
	 */
	CurrentResourceOwner = ResourceOwnerCreate(NULL, "Checkpointer");

	/*
	 * Create a memory context that we will do all our work in.  We do this so
	 * that we can reset the context during error recovery and thereby avoid
	 * possible memory leaks.  Formerly this code just ran in
	 * TopMemoryContext, but resetting that would be a really bad idea.
	 */
	checkpointer_context = AllocSetContextCreate(TopMemoryContext,
												 "Checkpointer",
												 ALLOCSET_DEFAULT_SIZES);
	MemoryContextSwitchTo(checkpointer_context);

	/*
	 * If an exception is encountered, processing resumes here.
	 *
	 * See notes in postgres.c about the design of this coding.
	 *
	 * The block below runs only on the error path (non-zero return from
	 * sigsetjmp); afterwards control falls through into the normal startup
	 * code that follows, so the main loop is re-entered after cleanup.
	 */
	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
	{
		/* Since not using PG_TRY, must reset error stack by hand */
		error_context_stack = NULL;

		/* Prevent interrupts while cleaning up */
		HOLD_INTERRUPTS();

		/* Report the error to the server log */
		EmitErrorReport();

		/*
		 * These operations are really just a minimal subset of
		 * AbortTransaction().  We don't have very many resources to worry
		 * about in checkpointer, but we do have LWLocks, buffers, and temp
		 * files.
		 */
		LWLockReleaseAll();
		ConditionVariableCancelSleep();
		pgstat_report_wait_end();
		AbortBufferIO();
		UnlockBuffers();
		/* buffer pins are released here: */
		ResourceOwnerRelease(CurrentResourceOwner,
							 RESOURCE_RELEASE_BEFORE_LOCKS,
							 false, true);
		/* we needn't bother with the other ResourceOwnerRelease phases */
		AtEOXact_Buffers(false);
		AtEOXact_SMgr();
		AtEOXact_Files();
		AtEOXact_HashTables(false);

		/*
		 * Warn any waiting backends that the checkpoint failed.  Bumping
		 * ckpt_failed and setting ckpt_done = ckpt_started is done under
		 * the ckpt_lck spinlock.
		 */
		if (ckpt_active)
		{
			SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
			CheckpointerShmem->ckpt_failed++;
			CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started;
			SpinLockRelease(&CheckpointerShmem->ckpt_lck);

			ckpt_active = false;
		}

		/*
		 * Now return to normal top-level context and clear ErrorContext for
		 * next time.
		 */
		MemoryContextSwitchTo(checkpointer_context);
		FlushErrorState();

		/* Flush any leaked data in the top-level context */
		MemoryContextResetAndDeleteChildren(checkpointer_context);

		/* Now we can allow interrupts again */
		RESUME_INTERRUPTS();

		/*
		 * Sleep at least 1 second after any error.  A write error is likely
		 * to be repeated, and we don't want to be filling the error logs as
		 * fast as we can.
		 */
		pg_usleep(1000000L);

		/*
		 * Close all open files after any error.  This is helpful on Windows,
		 * where holding deleted files open causes various strange errors.
		 * It's not clear we need it elsewhere, but shouldn't hurt.
		 */
		smgrcloseall();
	}

	/* We can now handle ereport(ERROR) */
	PG_exception_stack = &local_sigjmp_buf;

	/*
	 * Unblock signals (they were blocked when the postmaster forked us)
	 */
	PG_SETMASK(&UnBlockSig);

	/*
	 * Ensure all shared memory values are set correctly for the config. Doing
	 * this here ensures no race conditions from other concurrent updaters.
	 */
	UpdateSharedMemoryConfig();

	/*
	 * Advertise our latch that backends can use to wake us up while we're
	 * sleeping.
	 */
	ProcGlobal->checkpointerLatch = &MyProc->procLatch;

	/*
	 * Loop forever
	 */
	for (;;)
	{
		bool		do_checkpoint = false;
		int			flags = 0;
		pg_time_t	now;
		int			elapsed_secs;
		int			cur_timeout;
		int			rc;

		/* Clear any already-pending wakeups */
		ResetLatch(MyLatch);

		/*
		 * Process any requests or signals received recently.
		 */
		AbsorbFsyncRequests();

		if (got_SIGHUP)
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP);

			/*
			 * Checkpointer is the last process to shut down, so we ask it to
			 * hold the keys for a range of other tasks required most of which
			 * have nothing to do with checkpointing at all.
			 *
			 * For various reasons, some config values can change dynamically
			 * so the primary copy of them is held in shared memory to make
			 * sure all backends see the same value.  We make Checkpointer
			 * responsible for updating the shared memory copy if the
			 * parameter setting changes because of SIGHUP.
			 */
			UpdateSharedMemoryConfig();
		}
		if (checkpoint_requested)
		{
			/* An explicit request was flagged; count it in the stats. */
			checkpoint_requested = false;
			do_checkpoint = true;
			BgWriterStats.m_requested_checkpoints++;
		}
		if (shutdown_requested)
		{
			/*
			 * From here on, elog(ERROR) should end with exit(1), not send
			 * control back to the sigsetjmp block above
			 */
			ExitOnAnyError = true;
			/* Close down the database */
			ShutdownXLOG(0, 0);
			/* Normal exit from the checkpointer is here */
			proc_exit(0);		/* done */
		}

		/*
		 * Force a checkpoint if too much time has elapsed since the last one.
		 * Note that we count a timed checkpoint in stats only when this
		 * occurs without an external request, but we set the CAUSE_TIME flag
		 * bit even if there is also an external request.
		 */
		now = (pg_time_t) time(NULL);
		elapsed_secs = now - last_checkpoint_time;
		if (elapsed_secs >= CheckPointTimeout)
		{
			if (!do_checkpoint)
				BgWriterStats.m_timed_checkpoints++;
			do_checkpoint = true;
			flags |= CHECKPOINT_CAUSE_TIME;
		}

		/*
		 * Do a checkpoint if requested.
		 */
		if (do_checkpoint)
		{
			bool		ckpt_performed = false;
			bool		do_restartpoint;

			/*
			 * Check if we should perform a checkpoint or a restartpoint.  As a
			 * side-effect, RecoveryInProgress() initializes TimeLineID if
			 * it's not set yet.
			 */
			do_restartpoint = RecoveryInProgress();

			/*
			 * Atomically fetch the request flags to figure out what kind of a
			 * checkpoint we should perform, and increase the started-counter
			 * to acknowledge that we've started a new checkpoint.
			 */
			SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
			flags |= CheckpointerShmem->ckpt_flags;
			CheckpointerShmem->ckpt_flags = 0;
			CheckpointerShmem->ckpt_started++;
			SpinLockRelease(&CheckpointerShmem->ckpt_lck);

			/*
			 * The end-of-recovery checkpoint is a real checkpoint that's
			 * performed while we're still in recovery.
			 */
			if (flags & CHECKPOINT_END_OF_RECOVERY)
				do_restartpoint = false;

			/*
			 * We will warn if (a) too soon since last checkpoint (whatever
			 * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
			 * since the last checkpoint start.  Note in particular that this
			 * implementation will not generate warnings caused by
			 * CheckPointTimeout < CheckPointWarning.
			 */
			if (!do_restartpoint &&
				(flags & CHECKPOINT_CAUSE_XLOG) &&
				elapsed_secs < CheckPointWarning)
				ereport(LOG,
						(errmsg_plural("checkpoints are occurring too frequently (%d second apart)",
				"checkpoints are occurring too frequently (%d seconds apart)",
									   elapsed_secs,
									   elapsed_secs),
						 errhint("Consider increasing the configuration parameter \"max_wal_size\".")));

			/*
			 * Initialize checkpointer-private variables used during
			 * checkpoint.
			 */
			ckpt_active = true;
			if (do_restartpoint)
				ckpt_start_recptr = GetXLogReplayRecPtr(NULL);
			else
				ckpt_start_recptr = GetInsertRecPtr();
			ckpt_start_time = now;
			ckpt_cached_elapsed = 0;

			/*
			 * Do the checkpoint.
			 */
			if (!do_restartpoint)
			{
				CreateCheckPoint(flags);
				ckpt_performed = true;
			}
			else
				ckpt_performed = CreateRestartPoint(flags);

			/*
			 * After any checkpoint, close all smgr files.  This is so we
			 * won't hang onto smgr references to deleted files indefinitely.
			 */
			smgrcloseall();

			/*
			 * Indicate checkpoint completion to any waiting backends.
			 */
			SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
			CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started;
			SpinLockRelease(&CheckpointerShmem->ckpt_lck);

			if (ckpt_performed)
			{
				/*
				 * Note we record the checkpoint start time not end time as
				 * last_checkpoint_time.  This is so that time-driven
				 * checkpoints happen at a predictable spacing.
				 */
				last_checkpoint_time = now;
			}
			else
			{
				/*
				 * We were not able to perform the restartpoint (checkpoints
				 * throw an ERROR in case of error).  Most likely because we
				 * have not received any new checkpoint WAL records since the
				 * last restartpoint.  Try again in 15 s: backdating
				 * last_checkpoint_time makes the timeout test above fire 15
				 * seconds after "now".
				 */
				last_checkpoint_time = now - CheckPointTimeout + 15;
			}

			ckpt_active = false;
		}

		/* Check for archive_timeout and switch xlog files if necessary. */
		CheckArchiveTimeout();

		/*
		 * Send off activity statistics to the stats collector.  (The reason
		 * why we re-use bgwriter-related code for this is that the bgwriter
		 * and checkpointer used to be just one process.  It's probably not
		 * worth the trouble to split the stats support into two independent
		 * stats message types.)
		 */
		pgstat_send_bgwriter();

		/*
		 * Sleep until we are signaled or it's time for another checkpoint or
		 * xlog file switch.  cur_timeout is computed in seconds and only
		 * converted to milliseconds in the WaitLatch call.
		 */
		now = (pg_time_t) time(NULL);
		elapsed_secs = now - last_checkpoint_time;
		if (elapsed_secs >= CheckPointTimeout)
			continue;			/* no sleep for us ... */
		cur_timeout = CheckPointTimeout - elapsed_secs;
		if (XLogArchiveTimeout > 0 && !RecoveryInProgress())
		{
			elapsed_secs = now - last_xlog_switch_time;
			if (elapsed_secs >= XLogArchiveTimeout)
				continue;		/* no sleep for us ... */
			cur_timeout = Min(cur_timeout, XLogArchiveTimeout - elapsed_secs);
		}

		rc = WaitLatch(MyLatch,
					   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
					   cur_timeout * 1000L /* convert to ms */,
					   WAIT_EVENT_CHECKPOINTER_MAIN);

		/*
		 * Emergency bailout if postmaster has died.  This is to avoid the
		 * necessity for manual cleanup of all postmaster children.
		 */
		if (rc & WL_POSTMASTER_DEATH)
			exit(1);
	}
}
/*
 * Main entry point for syslogger process
 * argc/argv parameters are valid only in EXEC_BACKEND case.
 *
 * After per-process initialization (stderr redirection, closing the write
 * end of the logger pipe, latch and signal setup) the function loops
 * forever: it handles SIGHUP-driven configuration changes, performs
 * time-, size- or request-based log rotation, and then sleeps until the
 * latch is set, the pipe becomes readable (non-Windows), or the next
 * rotation time arrives.  It does not return to its caller; the normal exit
 * is proc_exit(0) once pipe_eof_seen is set.
 */
NON_EXEC_STATIC void
SysLoggerMain(int argc, char *argv[])
{
#ifndef WIN32
	char		logbuffer[READ_BUF_SIZE];
	int			bytes_in_logbuffer = 0;
#endif
	char	   *currentLogDir;
	char	   *currentLogFilename;
	int			currentLogRotationAge;
	pg_time_t	now;

	IsUnderPostmaster = true;	/* we are a postmaster subprocess now */

	MyProcPid = getpid();		/* reset MyProcPid */

	MyStartTime = time(NULL);	/* set our start time in case we call elog */
	now = MyStartTime;

#ifdef EXEC_BACKEND
	syslogger_parseArgs(argc, argv);
#endif   /* EXEC_BACKEND */

	am_syslogger = true;

	init_ps_display("logger process", "", "", "");

	/*
	 * If we restarted, our stderr is already redirected into our own input
	 * pipe.  This is of course pretty useless, not to mention that it
	 * interferes with detecting pipe EOF.  Point stderr to /dev/null. This
	 * assumes that all interesting messages generated in the syslogger will
	 * come through elog.c and will be sent to write_syslogger_file.
	 */
	if (redirection_done)
	{
		int			fd = open(DEVNULL, O_WRONLY, 0);

		/*
		 * The closes might look redundant, but they are not: we want to be
		 * darn sure the pipe gets closed even if the open failed.  We can
		 * survive running with stderr pointing nowhere, but we can't afford
		 * to have extra pipe input descriptors hanging around.
		 */
		close(fileno(stdout));
		close(fileno(stderr));
		if (fd != -1)
		{
			dup2(fd, fileno(stdout));
			dup2(fd, fileno(stderr));
			close(fd);
		}
	}

	/*
	 * Syslogger's own stderr can't be the syslogPipe, so set it back to text
	 * mode if we didn't just close it. (It was set to binary in
	 * SubPostmasterMain).
	 */
#ifdef WIN32
	else
		_setmode(_fileno(stderr), _O_TEXT);
#endif

	/*
	 * Also close our copy of the write end of the pipe.  This is needed to
	 * ensure we can detect pipe EOF correctly.  (But note that in the restart
	 * case, the postmaster already did this.)
	 */
#ifndef WIN32
	if (syslogPipe[1] >= 0)
		close(syslogPipe[1]);
	syslogPipe[1] = -1;
#else
	if (syslogPipe[1])
		CloseHandle(syslogPipe[1]);
	syslogPipe[1] = 0;
#endif

	/*
	 * If possible, make this process a group leader, so that the postmaster
	 * can signal any child processes too.  (syslogger probably never has any
	 * child processes, but for consistency we make all postmaster child
	 * processes do this.)
	 */
#ifdef HAVE_SETSID
	if (setsid() < 0)
		elog(FATAL, "setsid() failed: %m");
#endif

	InitializeLatchSupport();	/* needed for latch waits */

	/* Initialize private latch for use by signal handlers */
	InitLatch(&sysLoggerLatch);

	/*
	 * Properly accept or ignore signals the postmaster might send us
	 *
	 * Note: we ignore all termination signals, and instead exit only when all
	 * upstream processes are gone, to ensure we don't miss any dying gasps of
	 * broken backends...
	 */

	pqsignal(SIGHUP, sigHupHandler);	/* set flag to read config file */
	pqsignal(SIGINT, SIG_IGN);
	pqsignal(SIGTERM, SIG_IGN);
	pqsignal(SIGQUIT, SIG_IGN);
	pqsignal(SIGALRM, SIG_IGN);
	pqsignal(SIGPIPE, SIG_IGN);
	pqsignal(SIGUSR1, sigUsr1Handler);	/* request log rotation */
	pqsignal(SIGUSR2, SIG_IGN);

	/*
	 * Reset some signals that are accepted by postmaster but not here
	 */
	pqsignal(SIGCHLD, SIG_DFL);
	pqsignal(SIGTTIN, SIG_DFL);
	pqsignal(SIGTTOU, SIG_DFL);
	pqsignal(SIGCONT, SIG_DFL);
	pqsignal(SIGWINCH, SIG_DFL);

	PG_SETMASK(&UnBlockSig);

#ifdef WIN32
	/* Fire up separate data transfer thread */
	InitializeCriticalSection(&sysloggerSection);
	EnterCriticalSection(&sysloggerSection);

	threadHandle = (HANDLE) _beginthreadex(NULL, 0, pipeThread, NULL, 0, NULL);
	if (threadHandle == 0)
		elog(FATAL, "could not create syslogger data transfer thread: %m");
#endif   /* WIN32 */

	/*
	 * Remember active logfile's name.  We recompute this from the reference
	 * time because passing down just the pg_time_t is a lot cheaper than
	 * passing a whole file path in the EXEC_BACKEND case.
	 */
	last_file_name = logfile_getname(first_syslogger_file_time, NULL);

	/* remember active logfile parameters */
	currentLogDir = pstrdup(Log_directory);
	currentLogFilename = pstrdup(Log_filename);
	currentLogRotationAge = Log_RotationAge;
	/* set next planned rotation time */
	set_next_rotation_time();

	/* main worker loop */
	for (;;)
	{
		bool		time_based_rotation = false;
		int			size_rotation_for = 0;
		long		cur_timeout;
		int			cur_flags;

#ifndef WIN32
		int			rc;
#endif

		/* Clear any already-pending wakeups */
		ResetLatch(&sysLoggerLatch);

		/*
		 * Process any requests or signals received recently.
		 */
		if (got_SIGHUP)
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP);

			/*
			 * Check if the log directory or filename pattern changed in
			 * postgresql.conf. If so, force rotation to make sure we're
			 * writing the logfiles in the right place.
			 */
			if (strcmp(Log_directory, currentLogDir) != 0)
			{
				pfree(currentLogDir);
				currentLogDir = pstrdup(Log_directory);
				rotation_requested = true;

				/*
				 * Also, create new directory if not present; ignore errors
				 */
				mkdir(Log_directory, S_IRWXU);
			}
			if (strcmp(Log_filename, currentLogFilename) != 0)
			{
				pfree(currentLogFilename);
				currentLogFilename = pstrdup(Log_filename);
				rotation_requested = true;
			}

			/*
			 * If rotation time parameter changed, reset next rotation time,
			 * but don't immediately force a rotation.
			 */
			if (currentLogRotationAge != Log_RotationAge)
			{
				currentLogRotationAge = Log_RotationAge;
				set_next_rotation_time();
			}

			/*
			 * If we had a rotation-disabling failure, re-enable rotation
			 * attempts after SIGHUP, and force one immediately.
			 */
			if (rotation_disabled)
			{
				rotation_disabled = false;
				rotation_requested = true;
			}
		}

		if (Log_RotationAge > 0 && !rotation_disabled)
		{
			/* Do a logfile rotation if it's time */
			now = (pg_time_t) time(NULL);
			if (now >= next_rotation_time)
				rotation_requested = time_based_rotation = true;
		}

		if (!rotation_requested && Log_RotationSize > 0 && !rotation_disabled)
		{
			/*
			 * Do a rotation if file is too big.  The threshold compared
			 * against ftell() is Log_RotationSize * 1024.
			 */
			if (ftell(syslogFile) >= Log_RotationSize * 1024L)
			{
				rotation_requested = true;
				size_rotation_for |= LOG_DESTINATION_STDERR;
			}
			if (csvlogFile != NULL &&
				ftell(csvlogFile) >= Log_RotationSize * 1024L)
			{
				rotation_requested = true;
				size_rotation_for |= LOG_DESTINATION_CSVLOG;
			}
		}

		if (rotation_requested)
		{
			/*
			 * Force rotation when both values are zero. It means the request
			 * was sent by pg_rotate_logfile.
			 */
			if (!time_based_rotation && size_rotation_for == 0)
				size_rotation_for = LOG_DESTINATION_STDERR | LOG_DESTINATION_CSVLOG;
			logfile_rotate(time_based_rotation, size_rotation_for);
		}

		/*
		 * Calculate time till next time-based rotation, so that we don't
		 * sleep longer than that.  We assume the value of "now" obtained
		 * above is still close enough.  Note we can't make this calculation
		 * until after calling logfile_rotate(), since it will advance
		 * next_rotation_time.
		 *
		 * Also note that we need to beware of overflow in calculation of the
		 * timeout: with large settings of Log_RotationAge, next_rotation_time
		 * could be more than INT_MAX msec in the future.  In that case we'll
		 * wait no more than INT_MAX msec, and try again.
		 *
		 * When time-based rotation is not in effect, no WL_TIMEOUT flag is
		 * passed and cur_timeout is set to -1.
		 */
		if (Log_RotationAge > 0 && !rotation_disabled)
		{
			pg_time_t	delay;

			delay = next_rotation_time - now;
			if (delay > 0)
			{
				if (delay > INT_MAX / 1000)
					delay = INT_MAX / 1000;
				cur_timeout = delay * 1000L;	/* msec */
			}
			else
				cur_timeout = 0;
			cur_flags = WL_TIMEOUT;
		}
		else
		{
			cur_timeout = -1L;
			cur_flags = 0;
		}

		/*
		 * Sleep until there's something to do
		 */
#ifndef WIN32
		rc = WaitLatchOrSocket(&sysLoggerLatch,
							   WL_LATCH_SET | WL_SOCKET_READABLE | cur_flags,
							   syslogPipe[0],
							   cur_timeout);

		if (rc & WL_SOCKET_READABLE)
		{
			int			bytesRead;

			/* Append to whatever is still buffered from earlier reads. */
			bytesRead = read(syslogPipe[0],
							 logbuffer + bytes_in_logbuffer,
							 sizeof(logbuffer) - bytes_in_logbuffer);
			if (bytesRead < 0)
			{
				/* EINTR is silently ignored; other errors are logged. */
				if (errno != EINTR)
					ereport(LOG,
							(errcode_for_socket_access(),
							 errmsg("could not read from logger pipe: %m")));
			}
			else if (bytesRead > 0)
			{
				bytes_in_logbuffer += bytesRead;
				process_pipe_input(logbuffer, &bytes_in_logbuffer);
				continue;
			}
			else
			{
				/*
				 * Zero bytes read when select() is saying read-ready means
				 * EOF on the pipe: that is, there are no longer any processes
				 * with the pipe write end open.  Therefore, the postmaster
				 * and all backends are shut down, and we are done.
				 */
				pipe_eof_seen = true;

				/* if there's any data left then force it out now */
				flush_pipe_input(logbuffer, &bytes_in_logbuffer);
			}
		}
#else							/* WIN32 */

		/*
		 * On Windows we leave it to a separate thread to transfer data and
		 * detect pipe EOF.  The main thread just wakes up to handle SIGHUP
		 * and rotation conditions.
		 *
		 * Server code isn't generally thread-safe, so we ensure that only one
		 * of the threads is active at a time by entering the critical section
		 * whenever we're not sleeping.
		 */
		LeaveCriticalSection(&sysloggerSection);

		(void) WaitLatch(&sysLoggerLatch,
						 WL_LATCH_SET | cur_flags,
						 cur_timeout);

		EnterCriticalSection(&sysloggerSection);
#endif   /* WIN32 */

		if (pipe_eof_seen)
		{
			/*
			 * seeing this message on the real stderr is annoying - so we make
			 * it DEBUG1 to suppress in normal use.
			 */
			ereport(DEBUG1,
					(errmsg("logger shutting down")));

			/*
			 * Normal exit from the syslogger is here.  Note that we
			 * deliberately do not close syslogFile before exiting; this is to
			 * allow for the possibility of elog messages being generated
			 * inside proc_exit.  Regular exit() will take care of flushing
			 * and closing stdio channels.
			 */
			proc_exit(0);
		}
	}
}
/*
 * Write bytes into a shared message queue.
 *
 * mqh is the sender's handle, data/nbytes describe the bytes to write, and
 * nowait selects non-blocking behavior.
 *
 * Returns SHM_MQ_SUCCESS once all nbytes have been copied into the ring,
 * SHM_MQ_DETACHED if the queue is (or becomes) detached, or
 * SHM_MQ_WOULD_BLOCK if nowait is true and the ring is full.  A failure
 * result from shm_mq_notify_receiver() is passed through unchanged.
 *
 * On EVERY return path *bytes_written is set to the number of bytes of
 * "data" that were copied into the ring by this call, so callers can rely
 * on it regardless of the result code (e.g. to resume a partial send).
 */
static shm_mq_result
shm_mq_send_bytes(shm_mq_handle *mqh, Size nbytes, void *data, bool nowait,
				  Size *bytes_written)
{
	shm_mq	   *mq = mqh->mqh_queue;
	Size		sent = 0;
	uint64		used;
	Size		ringsize = mq->mq_ring_size;
	Size		available;

	while (sent < nbytes)
	{
		bool		detached;
		uint64		rb;

		/* Compute number of ring buffer bytes used and available. */
		rb = shm_mq_get_bytes_read(mq, &detached);
		Assert(mq->mq_bytes_written >= rb);
		used = mq->mq_bytes_written - rb;
		Assert(used <= ringsize);
		available = Min(ringsize - used, nbytes - sent);

		/* Bail out if the queue has been detached. */
		if (detached)
		{
			*bytes_written = sent;
			return SHM_MQ_DETACHED;
		}

		if (available == 0)
		{
			shm_mq_result res;

			/*
			 * The queue is full, so if the receiver isn't yet known to be
			 * attached, we must wait for that to happen.
			 */
			if (!mqh->mqh_counterparty_attached)
			{
				if (nowait)
				{
					if (shm_mq_get_receiver(mq) == NULL)
					{
						*bytes_written = sent;
						return SHM_MQ_WOULD_BLOCK;
					}
				}
				else if (!shm_mq_wait_internal(mq, &mq->mq_receiver,
											   mqh->mqh_handle))
				{
					mq->mq_detached = true;
					*bytes_written = sent;
					return SHM_MQ_DETACHED;
				}
				mqh->mqh_counterparty_attached = true;
			}

			/* Let the receiver know that we need them to read some data. */
			res = shm_mq_notify_receiver(mq);
			if (res != SHM_MQ_SUCCESS)
			{
				*bytes_written = sent;
				return res;
			}

			/* Skip manipulation of our latch if nowait = true. */
			if (nowait)
			{
				*bytes_written = sent;
				return SHM_MQ_WOULD_BLOCK;
			}

			/*
			 * Wait for our latch to be set.  It might already be set for
			 * some unrelated reason, but that'll just result in one extra
			 * trip through the loop.  It's worth it to avoid resetting the
			 * latch at top of loop, because setting an already-set latch is
			 * much cheaper than setting one that has been reset.
			 */
			WaitLatch(&MyProc->procLatch, WL_LATCH_SET, 0);

			/* An interrupt may have occurred while we were waiting. */
			CHECK_FOR_INTERRUPTS();

			/* Reset the latch so we don't spin. */
			ResetLatch(&MyProc->procLatch);
		}
		else
		{
			Size		offset = mq->mq_bytes_written % (uint64) ringsize;
			Size		sendnow = Min(available, ringsize - offset);

			/* Write as much data as we can via a single memcpy(). */
			memcpy(&mq->mq_ring[mq->mq_ring_offset + offset],
				   (char *) data + sent, sendnow);
			sent += sendnow;

			/*
			 * Update count of bytes written, with alignment padding.  Note
			 * that this will never actually insert any padding except at the
			 * end of a run of bytes, because the buffer size is a multiple of
			 * MAXIMUM_ALIGNOF, and each read is as well.
			 */
			Assert(sent == nbytes || sendnow == MAXALIGN(sendnow));
			shm_mq_inc_bytes_written(mq, MAXALIGN(sendnow));

			/*
			 * For efficiency, we don't set the reader's latch here.  We'll
			 * do that only when the buffer fills up or after writing an
			 * entire message.
			 */
		}
	}

	*bytes_written = sent;
	return SHM_MQ_SUCCESS;
}
/*
 * SyncRepWaitForLSN
 *
 * Block the calling backend until synchronous replication of XactCommitLSN
 * has been confirmed, if the user asked for that.
 *
 * A backend is normally in state SYNC_REP_NOT_WAITING.  It switches itself
 * to SYNC_REP_WAITING just before joining the wait queue; a WALSender, in
 * SyncRepWakeQueue(), flips the state to SYNC_REP_WAIT_COMPLETE once the
 * standby has confirmed, and this function then puts the state back to
 * SYNC_REP_NOT_WAITING before returning.
 */
void
SyncRepWaitForLSN(XLogRecPtr XactCommitLSN)
{
	char	   *saved_title = NULL;
	bool		finished = false;

	/*
	 * Nothing to do unless sync replication was requested AND standby names
	 * are configured.  The standbys need not actually be connected.
	 */
	if (!(SyncRepRequested() && SyncStandbysDefined()))
		return;

	Assert(SHMQueueIsDetached(&(MyProc->syncRepLinks)));
	Assert(WalSndCtl != NULL);

	/* The latch must be cleared before we enter the queue. */
	ResetLatch(&MyProc->waitLatch);

	/*
	 * Publish the LSN we are waiting for and enqueue ourselves, all under
	 * the exclusive lock.
	 */
	LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
	Assert(MyProc->syncRepState == SYNC_REP_NOT_WAITING);

	/*
	 * If the shared sync_standbys_defined flag is clear we do not wait at
	 * all; see SyncRepUpdateSyncStandbysDefined.
	 */
	if (!WalSndCtl->sync_standbys_defined)
	{
		LWLockRelease(SyncRepLock);
		return;
	}

	MyProc->waitLSN = XactCommitLSN;
	MyProc->syncRepState = SYNC_REP_WAITING;
	SyncRepQueueInsert();
	Assert(SyncRepQueueIsOrderedByLSN());
	LWLockRelease(SyncRepLock);

	/* Show "waiting for ..." in the process title, remembering the old one. */
	if (update_process_title)
	{
		int			title_len;
		const char *current_title = get_ps_display(&title_len);

		saved_title = (char *) palloc(title_len + 32 + 1);
		memcpy(saved_title, current_title, title_len);
		sprintf(saved_title + title_len, " waiting for %X/%X",
				XactCommitLSN.xlogid, XactCommitLSN.xrecoff);
		set_ps_display(saved_title, false);
		/* chop the suffix off again so the buffer holds the old title */
		saved_title[title_len] = '\0';
	}

	/*
	 * Standard latch wait loop on our private wait latch, ending when the
	 * LSN is confirmed or one of the bail-out conditions below is hit.
	 */
	while (!finished)
	{
		int			state;

		/*
		 * Sleep on the latch with a 60 second cap so that postmaster death
		 * is noticed regularly.  Waking up by timeout does not by itself end
		 * the loop.
		 */
		WaitLatch(&MyProc->waitLatch, 60000000L);

		/* The latch has to be reset before the state is examined. */
		ResetLatch(&MyProc->waitLatch);

		/*
		 * First peek at the state without the lock.  That read may be stale,
		 * so if it still says "waiting", look again with the lock held.  A
		 * "complete" reading can be trusted as is: once the walsender sets
		 * SYNC_REP_WAIT_COMPLETE it never touches the field again.
		 */
		state = MyProc->syncRepState;
		if (state == SYNC_REP_WAITING)
		{
			LWLockAcquire(SyncRepLock, LW_SHARED);
			state = MyProc->syncRepState;
			LWLockRelease(SyncRepLock);
		}

		if (state == SYNC_REP_WAIT_COMPLETE)
		{
			finished = true;
		}
		else if (ProcDiePending)
		{
			/*
			 * While a sync rep wait is pending we may neither acknowledge
			 * the commit nor raise ERROR/FATAL.  An error would make the
			 * client think the transaction aborted, although it is already
			 * committed locally.  An acknowledgement would let the client
			 * assume the commit is replicated, which may be false.  So we
			 * emit a WARNING (some clients can interpret it) and suppress
			 * all further output.  ProcDiePending is deliberately left set
			 * so the process dies once commit cleanup is done.
			 */
			ereport(WARNING,
					(errcode(ERRCODE_ADMIN_SHUTDOWN),
					 errmsg("canceling the wait for synchronous replication and terminating connection due to administrator command"),
					 errdetail("The transaction has already committed locally, but may not have been replicated to the standby.")));
			whereToSendOutput = DestNone;
			SyncRepCancelWait();
			finished = true;
		}
		else if (QueryCancelPending)
		{
			/*
			 * A query cancel cannot really abort anything at this stage, but
			 * silently ignoring it is unhelpful too; end the wait with an
			 * appropriate warning.
			 */
			QueryCancelPending = false;
			ereport(WARNING,
					(errmsg("canceling wait for synchronous replication due to user request"),
					 errdetail("The transaction has already committed locally, but may not have been replicated to the standby.")));
			SyncRepCancelWait();
			finished = true;
		}
		else if (!PostmasterIsAlive(true))
		{
			/*
			 * With the postmaster gone all wal senders will exit, so an
			 * acknowledgement will probably never arrive.  Give up.
			 */
			ProcDiePending = true;
			whereToSendOutput = DestNone;
			SyncRepCancelWait();
			finished = true;
		}
	}

	/*
	 * We have been taken off the queue.  Resetting our shared-memory fields
	 * without SyncRepLock is fine: walsenders pay no attention to a backend
	 * that is not queued.
	 */
	Assert(SHMQueueIsDetached(&(MyProc->syncRepLinks)));
	MyProc->syncRepState = SYNC_REP_NOT_WAITING;
	MyProc->waitLSN.xlogid = 0;
	MyProc->waitLSN.xrecoff = 0;

	/* Put the original process title back. */
	if (saved_title)
	{
		set_ps_display(saved_title, false);
		pfree(saved_title);
	}
}
/*
 * worker_spi_main
 *
 * Entry point of a background worker that, roughly once per second, counts
 * the rows of pg_class through SPI and logs the result.  The loop runs until
 * the SIGTERM handler sets got_sigterm; the worker then exits with status 0.
 * If the postmaster dies, the worker exits immediately with status 1.
 *
 * main_arg is not used.
 */
static void
worker_spi_main(Datum main_arg)
{
	/* Register functions for SIGTERM/SIGHUP management */
	pqsignal(SIGHUP, worker_spi_sighup);
	pqsignal(SIGTERM, worker_spi_sigterm);

	/* We're now ready to receive signals */
	BackgroundWorkerUnblockSignals();

	/* Connect to our database */
	BackgroundWorkerInitializeConnection("postgres", NULL);

	while (!got_sigterm)
	{
		int			ret;
		int			rc;
		StringInfoData buf;

		/*
		 * Background workers mustn't call usleep() or any direct equivalent:
		 * instead, they may wait on their process latch, which sleeps as
		 * necessary, but is awakened if postmaster dies.  That way the
		 * background process goes away immediately in an emergency.
		 */
		rc = WaitLatch(&MyProc->procLatch,
					   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
					   1000L);
		ResetLatch(&MyProc->procLatch);

		/* emergency bailout if postmaster has died */
		if (rc & WL_POSTMASTER_DEATH)
			proc_exit(1);

		/* Open a transaction with an active snapshot so SPI can run a query */
		StartTransactionCommand();
		SPI_connect();
		PushActiveSnapshot(GetTransactionSnapshot());

		/* Build the query string */
		initStringInfo(&buf);
		appendStringInfo(&buf, "SELECT count(*) FROM pg_class;");

		ret = SPI_execute(buf.data, true, 0);

		/* Some error messages in case of incorrect handling */
		if (ret != SPI_OK_SELECT)
			elog(FATAL, "SPI_execute failed: error code %d", ret);

		if (SPI_processed > 0)
		{
			int64		count;
			bool		isnull;

			/*
			 * count(*) yields a bigint, so the datum must be read as a
			 * 64-bit value.  Reading it with DatumGetInt32 truncates the
			 * value and, where int8 is passed by reference, reinterprets a
			 * pointer as the count.
			 */
			count = DatumGetInt64(SPI_getbinval(SPI_tuptable->vals[0],
												SPI_tuptable->tupdesc,
												1, &isnull));
			if (!isnull)
				elog(LOG, "Currently " INT64_FORMAT " relations in database",
					 count);
		}

		SPI_finish();
		PopActiveSnapshot();
		CommitTransactionCommand();
	}

	proc_exit(0);
}
/*
 * worker_spi_main
 *
 * Background worker entry point.  main_arg carries an integer index that
 * selects the schema ("schema<index>") holding the "counted" table.  After
 * initialization the worker repeatedly sleeps on its process latch for
 * worker_spi_naptime seconds, then runs one aggregation statement through
 * SPI and logs the returned value.  The loop ends once the SIGTERM handler
 * has set got_sigterm.
 */
void
worker_spi_main(Datum main_arg)
{
	int			worker_index = DatumGetInt32(main_arg);
	worktable  *wt = palloc(sizeof(worktable));
	char		schema_name[20];
	StringInfoData query;

	/* Derive the (still unquoted) schema and table names. */
	sprintf(schema_name, "schema%d", worker_index);
	wt->schema = pstrdup(schema_name);
	wt->name = pstrdup("counted");

	/* Signal handlers have to be in place before signals are unblocked. */
	pqsignal(SIGHUP, worker_spi_sighup);
	pqsignal(SIGTERM, worker_spi_sigterm);
	BackgroundWorkerUnblockSignals();

	/* Attach to the database we are going to work in. */
	BackgroundWorkerInitializeConnection("postgres", NULL);

	elog(LOG, "%s initialized with %s.%s",
		 MyBgworkerEntry->bgw_name, wt->schema, wt->name);
	initialize_worker_spi(wt);

	/*
	 * Only now quote the identifiers: initialize_worker_spi expects the raw
	 * names.  The unquoted copies are not freed, so a little memory may leak.
	 */
	wt->schema = quote_identifier(wt->schema);
	wt->name = quote_identifier(wt->name);

	/* The statement never changes, so assemble it once up front. */
	initStringInfo(&query);
	appendStringInfo(&query,
					 "WITH deleted AS (DELETE "
					 "FROM %s.%s "
					 "WHERE type = 'delta' RETURNING value), "
					 "total AS (SELECT coalesce(sum(value), 0) as sum "
					 "FROM deleted) "
					 "UPDATE %s.%s "
					 "SET value = %s.value + total.sum "
					 "FROM total WHERE type = 'total' "
					 "RETURNING %s.value",
					 wt->schema, wt->name,
					 wt->schema, wt->name,
					 wt->name,
					 wt->name);

	/* Work until the SIGTERM handler asks us to stop. */
	for (;;)
	{
		int			wake_events;
		int			spi_rc;

		if (got_sigterm)
			break;

		/*
		 * Sleep on the process latch rather than calling usleep() or
		 * similar: the latch wait also returns when the postmaster dies, so
		 * the worker can disappear promptly in an emergency.
		 */
		wake_events = WaitLatch(&MyProc->procLatch,
								WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
								worker_spi_naptime * 1000L);
		ResetLatch(&MyProc->procLatch);

		/* Postmaster is gone: leave right away. */
		if ((wake_events & WL_POSTMASTER_DEATH) != 0)
			proc_exit(1);

		/* A SIGHUP simply means "re-read the configuration". */
		if (got_sighup)
		{
			got_sighup = false;
			ProcessConfigFile(PGC_SIGHUP);
		}

		/*
		 * Open a transaction for the query.  The statement start timestamp
		 * is refreshed first, since it provides both the statement and the
		 * transaction start time.  SPI_connect() gives access to the SPI
		 * manager, the active snapshot supplies the MVCC view the query
		 * needs, and pgstat_report_activity() publishes what we are doing
		 * in the pgstat views.
		 */
		SetCurrentStatementStartTimestamp();
		StartTransactionCommand();
		SPI_connect();
		PushActiveSnapshot(GetTransactionSnapshot());
		pgstat_report_activity(STATE_RUNNING, query.data);

		/* Run the statement (read-write). */
		spi_rc = SPI_execute(query.data, false, 0);
		if (spi_rc != SPI_OK_UPDATE_RETURNING)
			elog(FATAL, "cannot select from table %s.%s: error code %d",
				 wt->schema, wt->name, spi_rc);

		/* Log the first returned value, unless it is NULL. */
		if (SPI_processed > 0)
		{
			bool		isnull;
			Datum		first_col;
			int32		val;

			first_col = SPI_getbinval(SPI_tuptable->vals[0],
									  SPI_tuptable->tupdesc,
									  1, &isnull);
			val = DatumGetInt32(first_col);
			if (!isnull)
				elog(LOG, "%s: count in %s.%s is now %d",
					 MyBgworkerEntry->bgw_name,
					 wt->schema, wt->name, val);
		}

		/* Wrap up the transaction and report ourselves idle again. */
		SPI_finish();
		PopActiveSnapshot();
		CommitTransactionCommand();
		pgstat_report_activity(STATE_IDLE, NULL);
	}

	proc_exit(1);
}
/*
 * kill_idle_main
 *
 * Entry point of the kill_idle background worker.  After connecting to the
 * "postgres" database it loops until the SIGTERM handler sets got_sigterm:
 * it sleeps on the process latch for kill_max_idle_time seconds, reloads the
 * configuration on SIGHUP (rebuilding the query when kill_max_idle_time
 * changed), runs the query produced by kill_idle_build_query() through SPI
 * and writes one LOG line per returned row.
 *
 * main_arg is not used.  The function does not return: it leaves through
 * proc_exit(), with status 1 if the postmaster died and 0 otherwise.
 */
static void
kill_idle_main(Datum main_arg)
{
	StringInfoData buf;

	/* Register functions for SIGTERM/SIGHUP management */
	pqsignal(SIGHUP, kill_idle_sighup);
	pqsignal(SIGTERM, kill_idle_sigterm);

	/* We're now ready to receive signals */
	BackgroundWorkerUnblockSignals();

	/* Connect to a database */
	BackgroundWorkerInitializeConnection("postgres", NULL);

	/* Build query for process */
	initStringInfo(&buf);
	kill_idle_build_query(&buf);

	while (!got_sigterm)
	{
		int			rc;
		int			ret;
		uint64		i;

		/* Wait necessary amount of time */
		rc = WaitLatch(&MyProc->procLatch,
					   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
					   kill_max_idle_time * 1000L,
					   PG_WAIT_EXTENSION);
		ResetLatch(&MyProc->procLatch);

		/* Emergency bailout if postmaster has died */
		if (rc & WL_POSTMASTER_DEATH)
			proc_exit(1);

		/* Process signals */
		if (got_sighup)
		{
			/* Save old value of kill interval */
			int			old_interval = kill_max_idle_time;

			/*
			 * Clear the flag before reloading, so that a SIGHUP arriving
			 * while the file is being processed is not lost.
			 */
			got_sighup = false;
			ProcessConfigFile(PGC_SIGHUP);
			ereport(LOG,
					(errmsg("bgworker kill_idle signal: processed SIGHUP")));

			/* Rebuild query if necessary */
			if (old_interval != kill_max_idle_time)
			{
				/*
				 * resetStringInfo() empties the buffer while keeping its
				 * storage.  Calling initStringInfo() again here would
				 * allocate a new buffer and leak the old one.
				 */
				resetStringInfo(&buf);
				kill_idle_build_query(&buf);
			}
		}

		if (got_sigterm)
		{
			/* Simply exit */
			ereport(LOG,
					(errmsg("bgworker kill_idle signal: processed SIGTERM")));
			proc_exit(0);
		}

		/* Process idle connection kill */
		SetCurrentStatementStartTimestamp();
		StartTransactionCommand();
		SPI_connect();
		PushActiveSnapshot(GetTransactionSnapshot());
		pgstat_report_activity(STATE_RUNNING, buf.data);

		/* Statement start time */
		SetCurrentStatementStartTimestamp();

		/* Execute query */
		ret = SPI_execute(buf.data, false, 0);

		/* Some error handling */
		if (ret != SPI_OK_SELECT)
			elog(FATAL, "Error when trying to kill idle connections");

		/* Do some processing and log stuff disconnected */
		for (i = 0; i < SPI_processed; i++)
		{
			int32		pidValue;
			bool		isnull;
			char	   *datname;
			char	   *usename;
			char	   *client_addr;

			/* Column 1 is read as a 32-bit integer (the backend PID). */
			pidValue = DatumGetInt32(SPI_getbinval(SPI_tuptable->vals[i],
												   SPI_tuptable->tupdesc,
												   1, &isnull));

			/*
			 * Columns 3-5 are fetched in text form.  SPI_getvalue() runs
			 * the column type's output function and returns NULL for a
			 * NULL column, so it works whatever type the query yields.
			 * NOTE(review): the query text is built elsewhere; these
			 * columns are presumably usename, datname and client_addr --
			 * confirm against kill_idle_build_query().
			 */
			usename = SPI_getvalue(SPI_tuptable->vals[i],
								   SPI_tuptable->tupdesc, 3);
			datname = SPI_getvalue(SPI_tuptable->vals[i],
								   SPI_tuptable->tupdesc, 4);
			client_addr = SPI_getvalue(SPI_tuptable->vals[i],
									   SPI_tuptable->tupdesc, 5);

			/* Log what has been disconnected */
			elog(LOG, "Disconnected idle connection: PID %d %s/%s/%s",
				 pidValue,
				 datname ? datname : "none",
				 usename ? usename : "none",
				 client_addr ? client_addr : "none");
		}

		SPI_finish();
		PopActiveSnapshot();
		CommitTransactionCommand();
		pgstat_report_activity(STATE_IDLE, NULL);
	}

	/* No problems, so clean exit */
	proc_exit(0);
}
/*
 * ProcSleep -- put a process to sleep on the specified lock
 *
 * Caller must have set MyProc->heldLocks to reflect locks already held
 * on the lockable object by this process (under all XIDs).
 *
 * The lock table's partition lock must be held at entry, and will be held
 * at exit.
 *
 * Parameters:
 *	locallock		local lock entry being waited for; its tag.mode, lock,
 *					proclock and hashcode fields are read here
 *	lockMethodTable	lock method whose conflictTab is consulted to decide
 *					where to queue and whether an immediate grant is possible
 *
 * Result: STATUS_OK if we acquired the lock, STATUS_ERROR if not (deadlock).
 *
 * ASSUME: that no one will fiddle with the queue until after
 *		we release the partition lock.
 *
 * NOTES: The process queue is now a priority queue for locking.
 */
int
ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
{
	LOCKMODE	lockmode = locallock->tag.mode;
	LOCK	   *lock = locallock->lock;
	PROCLOCK   *proclock = locallock->proclock;
	uint32		hashcode = locallock->hashcode;
	LWLock	   *partitionLock = LockHashPartitionLock(hashcode);
	PROC_QUEUE *waitQueue = &(lock->waitProcs);
	LOCKMASK	myHeldLocks = MyProc->heldLocks;
	bool		early_deadlock = false;
	bool		allow_autovacuum_cancel = true;
	int			myWaitStatus;
	PGPROC	   *proc;
	int			i;

	/*
	 * Determine where to add myself in the wait queue.
	 *
	 * Normally I should go at the end of the queue.  However, if I already
	 * hold locks that conflict with the request of any previous waiter, put
	 * myself in the queue just in front of the first such waiter. This is not
	 * a necessary step, since deadlock detection would move me to before that
	 * waiter anyway; but it's relatively cheap to detect such a conflict
	 * immediately, and avoid delaying till deadlock timeout.
	 *
	 * Special case: if I find I should go in front of some waiter, check to
	 * see if I conflict with already-held locks or the requests before that
	 * waiter.  If not, then just grant myself the requested lock immediately.
	 * This is the same as the test for immediate grant in LockAcquire, except
	 * we are only considering the part of the wait queue before my insertion
	 * point.
	 */
	if (myHeldLocks != 0)
	{
		/* Union of the lock modes requested by the waiters scanned so far */
		LOCKMASK	aheadRequests = 0;

		proc = (PGPROC *) waitQueue->links.next;
		for (i = 0; i < waitQueue->size; i++)
		{
			/* Must he wait for me? */
			if (lockMethodTable->conflictTab[proc->waitLockMode] & myHeldLocks)
			{
				/* Must I wait for him ? */
				if (lockMethodTable->conflictTab[lockmode] & proc->heldLocks)
				{
					/*
					 * Yes, so we have a deadlock.  Easiest way to clean up
					 * correctly is to call RemoveFromWaitQueue(), but we
					 * can't do that until we are *on* the wait queue. So, set
					 * a flag to check below, and break out of loop.  Also,
					 * record deadlock info for later message.
					 */
					RememberSimpleDeadLock(MyProc, lockmode, lock, proc);
					early_deadlock = true;
					break;
				}
				/* I must go before this waiter.  Check special case. */
				if ((lockMethodTable->conflictTab[lockmode] & aheadRequests) == 0 &&
					LockCheckConflicts(lockMethodTable,
									   lockmode,
									   lock,
									   proclock) == STATUS_OK)
				{
					/* Skip the wait and just grant myself the lock. */
					GrantLock(lock, proclock, lockmode);
					GrantAwaitedLock();
					return STATUS_OK;
				}
				/* Break out of loop to put myself before him */
				break;
			}
			/* Nope, so advance to next waiter */
			aheadRequests |= LOCKBIT_ON(proc->waitLockMode);
			proc = (PGPROC *) proc->links.next;
		}

		/*
		 * If we fall out of loop normally, proc points to waitQueue head, so
		 * we will insert at tail of queue as desired.
		 */
	}
	else
	{
		/* I hold no locks, so I can't push in front of anyone. */
		proc = (PGPROC *) &(waitQueue->links);
	}

	/*
	 * Insert self into queue, ahead of the given proc (or at tail of queue).
	 */
	SHMQueueInsertBefore(&(proc->links), &(MyProc->links));
	waitQueue->size++;

	lock->waitMask |= LOCKBIT_ON(lockmode);

	/* Set up wait information in PGPROC object, too */
	MyProc->waitLock = lock;
	MyProc->waitProcLock = proclock;
	MyProc->waitLockMode = lockmode;

	MyProc->waitStatus = STATUS_WAITING;

	/*
	 * If we detected deadlock, give up without waiting.  This must agree with
	 * CheckDeadLock's recovery code, except that we shouldn't release the
	 * semaphore since we haven't tried to lock it yet.
	 */
	if (early_deadlock)
	{
		RemoveFromWaitQueue(MyProc, hashcode);
		return STATUS_ERROR;
	}

	/* mark that we are waiting for a lock */
	lockAwaited = locallock;

	/*
	 * Release the lock table's partition lock.
	 *
	 * NOTE: this may also cause us to exit critical-section state, possibly
	 * allowing a cancel/die interrupt to be accepted. This is OK because we
	 * have recorded the fact that we are waiting for a lock, and so
	 * LockErrorCleanup will clean up if cancel/die happens.
	 */
	LWLockRelease(partitionLock);

	/*
	 * Also, now that we will successfully clean up after an ereport, it's
	 * safe to check to see if there's a buffer pin deadlock against the
	 * Startup process.  Of course, that's only necessary if we're doing Hot
	 * Standby and are not the Startup process ourselves.
	 */
	if (RecoveryInProgress() && !InRecovery)
		CheckRecoveryConflictDeadlock();

	/* Reset deadlock_state before enabling the timeout handler */
	deadlock_state = DS_NOT_YET_CHECKED;
	got_deadlock_timeout = false;

	/*
	 * Set timer so we can wake up after awhile and check for a deadlock. If a
	 * deadlock is detected, the handler releases the process's semaphore and
	 * sets MyProc->waitStatus = STATUS_ERROR, allowing us to know that we
	 * must report failure rather than success.
	 *
	 * By delaying the check until we've waited for a bit, we can avoid
	 * running the rather expensive deadlock-check code in most cases.
	 *
	 * If LockTimeout is set, also enable the timeout for that.  We can save a
	 * few cycles by enabling both timeout sources in one call.
	 */
	if (LockTimeout > 0)
	{
		EnableTimeoutParams timeouts[2];

		timeouts[0].id = DEADLOCK_TIMEOUT;
		timeouts[0].type = TMPARAM_AFTER;
		timeouts[0].delay_ms = DeadlockTimeout;
		timeouts[1].id = LOCK_TIMEOUT;
		timeouts[1].type = TMPARAM_AFTER;
		timeouts[1].delay_ms = LockTimeout;
		enable_timeouts(timeouts, 2);
	}
	else
		enable_timeout_after(DEADLOCK_TIMEOUT, DeadlockTimeout);

	/*
	 * If somebody wakes us between LWLockRelease and WaitLatch, the latch
	 * will not wait. But a set latch does not necessarily mean that the lock
	 * is free now, as there are many other sources for latch sets than
	 * somebody releasing the lock.
	 *
	 * We process interrupts whenever the latch has been set, so cancel/die
	 * interrupts are processed quickly. This means we must not mind losing
	 * control to a cancel/die interrupt here.  We don't, because we have no
	 * shared-state-change work to do after being granted the lock (the
	 * grantor did it all).  We do have to worry about canceling the deadlock
	 * timeout and updating the locallock table, but if we lose control to an
	 * error, LockErrorCleanup will fix that up.
	 */
	do
	{
		WaitLatch(MyLatch, WL_LATCH_SET, 0);
		ResetLatch(MyLatch);
		/* check for deadlocks first, as that's probably log-worthy */
		if (got_deadlock_timeout)
		{
			CheckDeadLock();
			got_deadlock_timeout = false;
		}
		CHECK_FOR_INTERRUPTS();

		/*
		 * waitStatus could change from STATUS_WAITING to something else
		 * asynchronously.  Read it just once per loop to prevent surprising
		 * behavior (such as missing log messages).
		 */
		myWaitStatus = *((volatile int *) &MyProc->waitStatus);

		/*
		 * If we are not deadlocked, but are waiting on an autovacuum-induced
		 * task, send a signal to interrupt it.
		 */
		if (deadlock_state == DS_BLOCKED_BY_AUTOVACUUM && allow_autovacuum_cancel)
		{
			PGPROC	   *autovac = GetBlockingAutoVacuumPgproc();
			PGXACT	   *autovac_pgxact = &ProcGlobal->allPgXact[autovac->pgprocno];

			LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

			/*
			 * Only do it if the worker is not working to protect against Xid
			 * wraparound.
			 */
			if ((autovac_pgxact->vacuumFlags & PROC_IS_AUTOVACUUM) &&
				!(autovac_pgxact->vacuumFlags & PROC_VACUUM_FOR_WRAPAROUND))
			{
				/* Copy the PID while ProcArrayLock is still held */
				int			pid = autovac->pid;
				StringInfoData locktagbuf;
				StringInfoData logbuf;	/* errdetail for server log */

				initStringInfo(&locktagbuf);
				initStringInfo(&logbuf);
				DescribeLockTag(&locktagbuf, &lock->tag);
				appendStringInfo(&logbuf,
								 _("Process %d waits for %s on %s."),
								 MyProcPid,
								 GetLockmodeName(lock->tag.locktag_lockmethodid,
												 lockmode),
								 locktagbuf.data);

				/* release lock as quickly as possible */
				LWLockRelease(ProcArrayLock);

				ereport(LOG,
						(errmsg("sending cancel to blocking autovacuum PID %d",
								pid),
						 errdetail_log("%s", logbuf.data)));

				pfree(logbuf.data);
				pfree(locktagbuf.data);

				/* send the autovacuum worker Back to Old Kent Road */
				if (kill(pid, SIGINT) < 0)
				{
					/* Just a warning to allow multiple callers */
					ereport(WARNING,
							(errmsg("could not send signal to process %d: %m",
									pid)));
				}
			}
			else
				LWLockRelease(ProcArrayLock);

			/* prevent signal from being resent more than once */
			allow_autovacuum_cancel = false;
		}

		/*
		 * If awoken after the deadlock check interrupt has run, and
		 * log_lock_waits is on, then report about the wait.
		 */
		if (log_lock_waits && deadlock_state != DS_NOT_YET_CHECKED)
		{
			StringInfoData buf,
						lock_waiters_sbuf,
						lock_holders_sbuf;
			const char *modename;
			long		secs;
			int			usecs;
			long		msecs;
			SHM_QUEUE  *procLocks;
			/* NOTE: shadows the function-level proclock within this block */
			PROCLOCK   *proclock;
			bool		first_holder = true,
						first_waiter = true;
			int			lockHoldersNum = 0;

			initStringInfo(&buf);
			initStringInfo(&lock_waiters_sbuf);
			initStringInfo(&lock_holders_sbuf);

			DescribeLockTag(&buf, &locallock->tag.lock);
			modename = GetLockmodeName(locallock->tag.lock.locktag_lockmethodid,
									   lockmode);
			TimestampDifference(get_timeout_start_time(DEADLOCK_TIMEOUT),
								GetCurrentTimestamp(),
								&secs, &usecs);
			/*
			 * Split the elapsed time into whole milliseconds (msecs) and the
			 * remaining microseconds (usecs), printed below as "%ld.%03d ms".
			 */
			msecs = secs * 1000 + usecs / 1000;
			usecs = usecs % 1000;

			/*
			 * we loop over the lock's procLocks to gather a list of all
			 * holders and waiters. Thus we will be able to provide more
			 * detailed information for lock debugging purposes.
			 *
			 * lock->procLocks contains all processes which hold or wait for
			 * this lock.
			 */

			LWLockAcquire(partitionLock, LW_SHARED);

			procLocks = &(lock->procLocks);
			proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
												 offsetof(PROCLOCK, lockLink));

			while (proclock)
			{
				/*
				 * we are a waiter if myProc->waitProcLock == proclock; we are
				 * a holder if it is NULL or something different
				 */
				if (proclock->tag.myProc->waitProcLock == proclock)
				{
					if (first_waiter)
					{
						appendStringInfo(&lock_waiters_sbuf, "%d",
										 proclock->tag.myProc->pid);
						first_waiter = false;
					}
					else
						appendStringInfo(&lock_waiters_sbuf, ", %d",
										 proclock->tag.myProc->pid);
				}
				else
				{
					if (first_holder)
					{
						appendStringInfo(&lock_holders_sbuf, "%d",
										 proclock->tag.myProc->pid);
						first_holder = false;
					}
					else
						appendStringInfo(&lock_holders_sbuf, ", %d",
										 proclock->tag.myProc->pid);

					/* Counted for every holder; selects singular/plural detail */
					lockHoldersNum++;
				}

				proclock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->lockLink,
													 offsetof(PROCLOCK, lockLink));
			}

			LWLockRelease(partitionLock);

			if (deadlock_state == DS_SOFT_DEADLOCK)
				ereport(LOG,
						(errmsg("process %d avoided deadlock for %s on %s by rearranging queue order after %ld.%03d ms",
								MyProcPid, modename, buf.data, msecs, usecs),
						 (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
											   "Processes holding the lock: %s. Wait queue: %s.",
											   lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
			else if (deadlock_state == DS_HARD_DEADLOCK)
			{
				/*
				 * This message is a bit redundant with the error that will be
				 * reported subsequently, but in some cases the error report
				 * might not make it to the log (eg, if it's caught by an
				 * exception handler), and we want to ensure all long-wait
				 * events get logged.
				 */
				ereport(LOG,
						(errmsg("process %d detected deadlock while waiting for %s on %s after %ld.%03d ms",
								MyProcPid, modename, buf.data, msecs, usecs),
						 (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
											   "Processes holding the lock: %s. Wait queue: %s.",
											   lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
			}

			if (myWaitStatus == STATUS_WAITING)
				ereport(LOG,
						(errmsg("process %d still waiting for %s on %s after %ld.%03d ms",
								MyProcPid, modename, buf.data, msecs, usecs),
						 (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
											   "Processes holding the lock: %s. Wait queue: %s.",
											   lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
			else if (myWaitStatus == STATUS_OK)
				ereport(LOG,
						(errmsg("process %d acquired %s on %s after %ld.%03d ms",
								MyProcPid, modename, buf.data, msecs, usecs)));
			else
			{
				Assert(myWaitStatus == STATUS_ERROR);

				/*
				 * Currently, the deadlock checker always kicks its own
				 * process, which means that we'll only see STATUS_ERROR when
				 * deadlock_state == DS_HARD_DEADLOCK, and there's no need to
				 * print redundant messages.  But for completeness and
				 * future-proofing, print a message if it looks like someone
				 * else kicked us off the lock.
				 */
				if (deadlock_state != DS_HARD_DEADLOCK)
					ereport(LOG,
							(errmsg("process %d failed to acquire %s on %s after %ld.%03d ms",
									MyProcPid, modename, buf.data, msecs, usecs),
							 (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
												   "Processes holding the lock: %s. Wait queue: %s.",
												   lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
			}

			/*
			 * At this point we might still need to wait for the lock. Reset
			 * state so we don't print the above messages again.
			 */
			deadlock_state = DS_NO_DEADLOCK;

			pfree(buf.data);
			pfree(lock_holders_sbuf.data);
			pfree(lock_waiters_sbuf.data);
		}
	} while (myWaitStatus == STATUS_WAITING);

	/*
	 * Disable the timers, if they are still running.  As in LockErrorCleanup,
	 * we must preserve the LOCK_TIMEOUT indicator flag: if a lock timeout has
	 * already caused QueryCancelPending to become set, we want the cancel to
	 * be reported as a lock timeout, not a user cancel.
	 */
	if (LockTimeout > 0)
	{
		DisableTimeoutParams timeouts[2];

		timeouts[0].id = DEADLOCK_TIMEOUT;
		timeouts[0].keep_indicator = false;
		timeouts[1].id = LOCK_TIMEOUT;
		timeouts[1].keep_indicator = true;
		disable_timeouts(timeouts, 2);
	}
	else
		disable_timeout(DEADLOCK_TIMEOUT, false);

	/*
	 * Re-acquire the lock table's partition lock.  We have to do this to hold
	 * off cancel/die interrupts before we can mess with lockAwaited (else we
	 * might have a missed or duplicated locallock update).
	 */
	LWLockAcquire(partitionLock, LW_EXCLUSIVE);

	/*
	 * We no longer want LockErrorCleanup to do anything.
	 */
	lockAwaited = NULL;

	/*
	 * If we got the lock, be sure to remember it in the locallock table.
	 */
	if (MyProc->waitStatus == STATUS_OK)
		GrantAwaitedLock();

	/*
	 * We don't have to do anything else, because the awaker did all the
	 * necessary update of the lock table and MyProc.
	 */
	return MyProc->waitStatus;
}
/*
 * Attempt to read a tuple from one of our parallel workers.
 *
 * Readers are polled without blocking, starting at gatherstate->nextreader
 * and moving round-robin.  Returns the next tuple obtained from any reader.
 * Returns NULL when the last reader has finished (after shutting down the
 * workers), or when every surviving reader has been polled without result
 * and gatherstate->need_to_scan_locally is set, so that the caller can
 * produce a tuple from the local copy of the plan instead.  Otherwise the
 * function sleeps on the process latch and polls again.
 */
static HeapTuple
gather_readnext(GatherState *gatherstate)
{
	/*
	 * Number of readers polled without getting a tuple since we entered the
	 * function or last slept.  Counting visits, instead of remembering the
	 * array position we started from, stays correct when readers are
	 * removed and the array is compacted underneath us; a remembered
	 * position can end up pointing at a reader that was never polled.
	 */
	int			nvisited = 0;

	for (;;)
	{
		TupleQueueReader *reader;
		HeapTuple	tup;
		bool		readerdone;

		/* Make sure we've read all messages from workers. */
		HandleParallelMessages();

		/* Attempt to read a tuple, but don't block if none is available. */
		reader = gatherstate->reader[gatherstate->nextreader];
		tup = TupleQueueReaderNext(reader, true, &readerdone);

		/*
		 * If this reader is done, remove it and close the gap in the array.
		 * If all readers are done, clean up remaining worker state.
		 */
		if (readerdone)
		{
			DestroyTupleQueueReader(reader);
			--gatherstate->nreaders;
			if (gatherstate->nreaders == 0)
			{
				ExecShutdownGatherWorkers(gatherstate);
				return NULL;
			}
			memmove(&gatherstate->reader[gatherstate->nextreader],
					&gatherstate->reader[gatherstate->nextreader + 1],
					sizeof(TupleQueueReader *)
					* (gatherstate->nreaders - gatherstate->nextreader));
			if (gatherstate->nextreader >= gatherstate->nreaders)
				gatherstate->nextreader = 0;

			/* The slot now holds a different reader; poll it next. */
			continue;
		}

		/* If we got a tuple, return it. */
		if (tup)
			return tup;

		/*
		 * Advance nextreader pointer in round-robin fashion.  Note that we
		 * only reach this code if we weren't able to get a tuple from the
		 * current worker.  We used to advance the nextreader pointer after
		 * every tuple, but it turns out to be much more efficient to keep
		 * reading from the same queue until that would require blocking.
		 */
		gatherstate->nextreader =
			(gatherstate->nextreader + 1) % gatherstate->nreaders;

		/* Have we visited every (surviving) TupleQueueReader? */
		nvisited++;
		if (nvisited >= gatherstate->nreaders)
		{
			/*
			 * If (still) running plan locally, return NULL so caller can
			 * generate another tuple from the local copy of the plan.
			 */
			if (gatherstate->need_to_scan_locally)
				return NULL;

			/*
			 * Nothing to do except wait for developments.  The latch is
			 * reset before interrupts are checked, so that a wakeup which
			 * arrives after the check is not wiped out by the reset.
			 */
			WaitLatch(MyLatch, WL_LATCH_SET, 0);
			ResetLatch(MyLatch);
			CHECK_FOR_INTERRUPTS();
			nvisited = 0;
		}
	}
}