/* * Send reply message to primary, indicating our current XLOG positions and * the current time. */ static void XLogWalRcvSendReply(void) { char buf[sizeof(StandbyReplyMessage) + 1]; TimestampTz now; /* * If the user doesn't want status to be reported to the master, be sure * to exit before doing anything at all. */ if (wal_receiver_status_interval <= 0) return; /* Get current timestamp. */ now = GetCurrentTimestamp(); /* * We can compare the write and flush positions to the last message we * sent without taking any lock, but the apply position requires a spin * lock, so we don't check that unless something else has changed or 10 * seconds have passed. This means that the apply log position will * appear, from the master's point of view, to lag slightly, but since * this is only for reporting purposes and only on idle systems, that's * probably OK. */ if (XLByteEQ(reply_message.write, LogstreamResult.Write) && XLByteEQ(reply_message.flush, LogstreamResult.Flush) && !TimestampDifferenceExceeds(reply_message.sendTime, now, wal_receiver_status_interval * 1000)) return; /* Construct a new message. */ reply_message.write = LogstreamResult.Write; reply_message.flush = LogstreamResult.Flush; reply_message.apply = GetXLogReplayRecPtr(); reply_message.sendTime = now; elog(DEBUG2, "sending write %X/%X flush %X/%X apply %X/%X", reply_message.write.xlogid, reply_message.write.xrecoff, reply_message.flush.xlogid, reply_message.flush.xrecoff, reply_message.apply.xlogid, reply_message.apply.xrecoff); /* Prepend with the message type and send it. */ buf[0] = 'r'; memcpy(&buf[1], &reply_message, sizeof(StandbyReplyMessage)); walrcv_send(buf, sizeof(StandbyReplyMessage) + 1); }
/* * Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs. * We wait here for a while then return. If we decide we can't wait any * more then we return true, if we can wait some more return false. */ static bool WaitExceedsMaxStandbyDelay(void) { /* Are we past max_standby_delay? */ if (MaxStandbyDelay >= 0 && TimestampDifferenceExceeds(GetLatestXLogTime(), GetCurrentTimestamp(), MaxStandbyDelay)) return true; /* * Sleep a bit (this is essential to avoid busy-waiting). */ pg_usleep(standbyWait_us); /* * Progressively increase the sleep times, but not to more than 1s, * since pg_usleep isn't interruptable on some platforms. */ standbyWait_us *= 2; if (standbyWait_us > 1000000) standbyWait_us = 1000000; return false; }
/* * lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation * * This routine vacuums a single heap, cleans out its indexes, and * updates its relpages and reltuples statistics. * * At entry, we have already established a transaction and opened * and locked the relation. */ void lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, BufferAccessStrategy bstrategy) { LVRelStats *vacrelstats; Relation *Irel; int nindexes; BlockNumber possibly_freeable; PGRUsage ru0; TimestampTz starttime = 0; bool scan_all; TransactionId freezeTableLimit; pg_rusage_init(&ru0); /* measure elapsed time iff autovacuum logging requires it */ if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration > 0) starttime = GetCurrentTimestamp(); if (vacstmt->options & VACOPT_VERBOSE) elevel = INFO; else elevel = DEBUG2; vac_strategy = bstrategy; vacuum_set_xid_limits(vacstmt->freeze_min_age, vacstmt->freeze_table_age, onerel->rd_rel->relisshared, &OldestXmin, &FreezeLimit, &freezeTableLimit); scan_all = TransactionIdPrecedesOrEquals(onerel->rd_rel->relfrozenxid, freezeTableLimit); vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats)); vacrelstats->old_rel_tuples = onerel->rd_rel->reltuples; vacrelstats->num_index_scans = 0; /* Open all indexes of the relation */ vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel); vacrelstats->hasindex = (nindexes > 0); /* Do the vacuuming */ lazy_scan_heap(onerel, vacrelstats, Irel, nindexes, scan_all); /* Done with indexes */ vac_close_indexes(nindexes, Irel, NoLock); /* * Optionally truncate the relation. * * Don't even think about it unless we have a shot at releasing a goodly * number of pages. Otherwise, the time taken isn't worth it. */ possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages; if (possibly_freeable > 0 && (possibly_freeable >= REL_TRUNCATE_MINIMUM || possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION)) lazy_truncate_heap(onerel, vacrelstats); /* Vacuum the Free Space Map */ FreeSpaceMapVacuum(onerel); /* * Update statistics in pg_class. But don't change relfrozenxid if we * skipped any pages. */ vac_update_relstats(onerel, vacrelstats->rel_pages, vacrelstats->new_rel_tuples, vacrelstats->hasindex, (vacrelstats->scanned_pages < vacrelstats->rel_pages) ? InvalidTransactionId : FreezeLimit); /* report results to the stats collector, too */ pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared, vacrelstats->new_rel_tuples); /* and log the action if appropriate */ if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0) { if (Log_autovacuum_min_duration == 0 || TimestampDifferenceExceeds(starttime, GetCurrentTimestamp(), Log_autovacuum_min_duration)) ereport(LOG, (errmsg("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n" "pages: %d removed, %d remain\n" "tuples: %.0f removed, %.0f remain\n" "system usage: %s", get_database_name(MyDatabaseId), get_namespace_name(RelationGetNamespace(onerel)), RelationGetRelationName(onerel), vacrelstats->num_index_scans, vacrelstats->pages_removed, vacrelstats->rel_pages, vacrelstats->tuples_deleted, vacrelstats->new_rel_tuples, pg_rusage_show(&ru0)))); } }
/* * lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation * * This routine vacuums a single heap, cleans out its indexes, and * updates its relpages and reltuples statistics. * * At entry, we have already established a transaction and opened * and locked the relation. */ void lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, BufferAccessStrategy bstrategy) { LVRelStats *vacrelstats; Relation *Irel; int nindexes; BlockNumber possibly_freeable; PGRUsage ru0; TimestampTz starttime = 0; long secs; int usecs; double read_rate, write_rate; bool scan_all; TransactionId freezeTableLimit; BlockNumber new_rel_pages; double new_rel_tuples; BlockNumber new_rel_allvisible; TransactionId new_frozen_xid; /* measure elapsed time iff autovacuum logging requires it */ if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0) { pg_rusage_init(&ru0); starttime = GetCurrentTimestamp(); } if (vacstmt->options & VACOPT_VERBOSE) elevel = INFO; else elevel = DEBUG2; vac_strategy = bstrategy; vacuum_set_xid_limits(vacstmt->freeze_min_age, vacstmt->freeze_table_age, onerel->rd_rel->relisshared, &OldestXmin, &FreezeLimit, &freezeTableLimit); scan_all = TransactionIdPrecedesOrEquals(onerel->rd_rel->relfrozenxid, freezeTableLimit); vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats)); vacrelstats->old_rel_pages = onerel->rd_rel->relpages; vacrelstats->old_rel_tuples = onerel->rd_rel->reltuples; vacrelstats->num_index_scans = 0; /* Open all indexes of the relation */ vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel); vacrelstats->hasindex = (nindexes > 0); /* Do the vacuuming */ lazy_scan_heap(onerel, vacrelstats, Irel, nindexes, scan_all); /* Done with indexes */ vac_close_indexes(nindexes, Irel, NoLock); /* * Optionally truncate the relation. * * Don't even think about it unless we have a shot at releasing a goodly * number of pages. Otherwise, the time taken isn't worth it. */ possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages; if (possibly_freeable > 0 && (possibly_freeable >= REL_TRUNCATE_MINIMUM || possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION)) lazy_truncate_heap(onerel, vacrelstats); /* Vacuum the Free Space Map */ FreeSpaceMapVacuum(onerel); /* * Update statistics in pg_class. * * A corner case here is that if we scanned no pages at all because every * page is all-visible, we should not update relpages/reltuples, because * we have no new information to contribute. In particular this keeps * us from replacing relpages=reltuples=0 (which means "unknown tuple * density") with nonzero relpages and reltuples=0 (which means "zero * tuple density") unless there's some actual evidence for the latter. * * We do update relallvisible even in the corner case, since if the * table is all-visible we'd definitely like to know that. But clamp * the value to be not more than what we're setting relpages to. * * Also, don't change relfrozenxid if we skipped any pages, since then * we don't know for certain that all tuples have a newer xmin. */ new_rel_pages = vacrelstats->rel_pages; new_rel_tuples = vacrelstats->new_rel_tuples; if (vacrelstats->scanned_pages == 0 && new_rel_pages > 0) { new_rel_pages = vacrelstats->old_rel_pages; new_rel_tuples = vacrelstats->old_rel_tuples; } new_rel_allvisible = visibilitymap_count(onerel); if (new_rel_allvisible > new_rel_pages) new_rel_allvisible = new_rel_pages; new_frozen_xid = FreezeLimit; if (vacrelstats->scanned_pages < vacrelstats->rel_pages) new_frozen_xid = InvalidTransactionId; vac_update_relstats(onerel, new_rel_pages, new_rel_tuples, new_rel_allvisible, vacrelstats->hasindex, new_frozen_xid); /* report results to the stats collector, too */ pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared, new_rel_tuples); /* and log the action if appropriate */ if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0) { TimestampTz endtime = GetCurrentTimestamp(); if (Log_autovacuum_min_duration == 0 || TimestampDifferenceExceeds(starttime, endtime, Log_autovacuum_min_duration)) { TimestampDifference(starttime, endtime, &secs, &usecs); read_rate = 0; write_rate = 0; if ((secs > 0) || (usecs > 0)) { read_rate = (double) BLCKSZ * VacuumPageMiss / (1024 * 1024) / (secs + usecs / 1000000.0); write_rate = (double) BLCKSZ * VacuumPageDirty / (1024 * 1024) / (secs + usecs / 1000000.0); } ereport(LOG, (errmsg("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n" "pages: %d removed, %d remain\n" "tuples: %.0f removed, %.0f remain\n" "buffer usage: %d hits, %d misses, %d dirtied\n" "avg read rate: %.3f MiB/s, avg write rate: %.3f MiB/s\n" "system usage: %s", get_database_name(MyDatabaseId), get_namespace_name(RelationGetNamespace(onerel)), RelationGetRelationName(onerel), vacrelstats->num_index_scans, vacrelstats->pages_removed, vacrelstats->rel_pages, vacrelstats->tuples_deleted, vacrelstats->new_rel_tuples, VacuumPageHit, VacuumPageMiss, VacuumPageDirty, read_rate,write_rate, pg_rusage_show(&ru0)))); } } }
/* * This is the main executioner for any query backend that conflicts with * recovery processing. Judgement has already been passed on it within * a specific rmgr. Here we just issue the orders to the procs. The procs * then throw the required error as instructed. */ static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist, ProcSignalReason reason) { TimestampTz waitStart; char *new_status; /* Fast exit, to avoid a kernel call if there's no work to be done. */ if (!VirtualTransactionIdIsValid(*waitlist)) return; waitStart = GetCurrentTimestamp(); new_status = NULL; /* we haven't changed the ps display */ while (VirtualTransactionIdIsValid(*waitlist)) { /* reset standbyWait_us for each xact we wait for */ standbyWait_us = STANDBY_INITIAL_WAIT_US; /* wait until the virtual xid is gone */ while (!ConditionalVirtualXactLockTableWait(*waitlist)) { /* * Report via ps if we have been waiting for more than 500 msec * (should that be configurable?) */ if (update_process_title && new_status == NULL && TimestampDifferenceExceeds(waitStart, GetCurrentTimestamp(), 500)) { const char *old_status; int len; old_status = get_ps_display(&len); new_status = (char *) palloc(len + 8 + 1); memcpy(new_status, old_status, len); strcpy(new_status + len, " waiting"); set_ps_display(new_status, false); new_status[len] = '\0'; /* truncate off " waiting" */ } /* Is it time to kill it? */ if (WaitExceedsMaxStandbyDelay()) { pid_t pid; /* * Now find out who to throw out of the balloon. */ Assert(VirtualTransactionIdIsValid(*waitlist)); pid = CancelVirtualTransaction(*waitlist, reason); /* * Wait a little bit for it to die so that we avoid flooding * an unresponsive backend when system is heavily loaded. */ if (pid != 0) pg_usleep(5000L); } } /* The virtual transaction is gone now, wait for the next one */ waitlist++; } /* Reset ps display if we changed it */ if (new_status) { set_ps_display(new_status, false); pfree(new_status); } }
/* * Start new apply background worker, if possible. */ void logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, Oid userid, Oid relid) { BackgroundWorker bgw; BackgroundWorkerHandle *bgw_handle; uint16 generation; int i; int slot = 0; LogicalRepWorker *worker = NULL; int nsyncworkers; TimestampTz now; ereport(DEBUG1, (errmsg("starting logical replication worker for subscription \"%s\"", subname))); /* Report this after the initial starting message for consistency. */ if (max_replication_slots == 0) ereport(ERROR, (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), errmsg("cannot start logical replication workers when max_replication_slots = 0"))); /* * We need to do the modification of the shared memory under lock so that * we have consistent view. */ LWLockAcquire(LogicalRepWorkerLock, LW_EXCLUSIVE); retry: /* Find unused worker slot. */ for (i = 0; i < max_logical_replication_workers; i++) { LogicalRepWorker *w = &LogicalRepCtx->workers[i]; if (!w->in_use) { worker = w; slot = i; break; } } nsyncworkers = logicalrep_sync_worker_count(subid); now = GetCurrentTimestamp(); /* * If we didn't find a free slot, try to do garbage collection. The * reason we do this is because if some worker failed to start up and its * parent has crashed while waiting, the in_use state was never cleared. */ if (worker == NULL || nsyncworkers >= max_sync_workers_per_subscription) { bool did_cleanup = false; for (i = 0; i < max_logical_replication_workers; i++) { LogicalRepWorker *w = &LogicalRepCtx->workers[i]; /* * If the worker was marked in use but didn't manage to attach in * time, clean it up. */ if (w->in_use && !w->proc && TimestampDifferenceExceeds(w->launch_time, now, wal_receiver_timeout)) { elog(WARNING, "logical replication worker for subscription %u took too long to start; canceled", w->subid); logicalrep_worker_cleanup(w); did_cleanup = true; } } if (did_cleanup) goto retry; } /* * If we reached the sync worker limit per subscription, just exit * silently as we might get here because of an otherwise harmless race * condition. */ if (nsyncworkers >= max_sync_workers_per_subscription) { LWLockRelease(LogicalRepWorkerLock); return; } /* * However if there are no more free worker slots, inform user about it * before exiting. */ if (worker == NULL) { LWLockRelease(LogicalRepWorkerLock); ereport(WARNING, (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), errmsg("out of logical replication worker slots"), errhint("You might need to increase max_logical_replication_workers."))); return; } /* Prepare the worker slot. */ worker->launch_time = now; worker->in_use = true; worker->generation++; worker->proc = NULL; worker->dbid = dbid; worker->userid = userid; worker->subid = subid; worker->relid = relid; worker->relstate = SUBREL_STATE_UNKNOWN; worker->relstate_lsn = InvalidXLogRecPtr; worker->last_lsn = InvalidXLogRecPtr; TIMESTAMP_NOBEGIN(worker->last_send_time); TIMESTAMP_NOBEGIN(worker->last_recv_time); worker->reply_lsn = InvalidXLogRecPtr; TIMESTAMP_NOBEGIN(worker->reply_time); /* Before releasing lock, remember generation for future identification. */ generation = worker->generation; LWLockRelease(LogicalRepWorkerLock); /* Register the new dynamic worker. */ memset(&bgw, 0, sizeof(bgw)); bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); snprintf(bgw.bgw_function_name, BGW_MAXLEN, "ApplyWorkerMain"); if (OidIsValid(relid)) snprintf(bgw.bgw_name, BGW_MAXLEN, "logical replication worker for subscription %u sync %u", subid, relid); else snprintf(bgw.bgw_name, BGW_MAXLEN, "logical replication worker for subscription %u", subid); snprintf(bgw.bgw_type, BGW_MAXLEN, "logical replication worker"); bgw.bgw_restart_time = BGW_NEVER_RESTART; bgw.bgw_notify_pid = MyProcPid; bgw.bgw_main_arg = Int32GetDatum(slot); if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle)) { /* Failed to start worker, so clean up the worker slot. */ LWLockAcquire(LogicalRepWorkerLock, LW_EXCLUSIVE); Assert(generation == worker->generation); logicalrep_worker_cleanup(worker); LWLockRelease(LogicalRepWorkerLock); ereport(WARNING, (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), errmsg("out of background worker slots"), errhint("You might need to increase max_worker_processes."))); return; } /* Now wait until it attaches. */ WaitForReplicationWorkerAttach(worker, generation, bgw_handle); }
/* * lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation * * This routine vacuums a single heap, cleans out its indexes, and * updates its relpages and reltuples statistics. * * At entry, we have already established a transaction and opened * and locked the relation. * * The return value indicates whether this function has held off * interrupts -- caller must RESUME_INTERRUPTS() after commit if true. */ bool lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, BufferAccessStrategy bstrategy, List *updated_stats) { LVRelStats *vacrelstats; Relation *Irel; int nindexes; BlockNumber possibly_freeable; PGRUsage ru0; TimestampTz starttime = 0; bool heldoff = false; pg_rusage_init(&ru0); /* measure elapsed time iff autovacuum logging requires it */ if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration > 0) starttime = GetCurrentTimestamp(); if (vacstmt->verbose) elevel = INFO; else elevel = DEBUG2; if (Gp_role == GP_ROLE_DISPATCH) elevel = DEBUG2; /* vacuum and analyze messages aren't interesting from the QD */ #ifdef FAULT_INJECTOR if (vacuumStatement_IsInAppendOnlyDropPhase(vacstmt)) { FaultInjector_InjectFaultIfSet( CompactionBeforeSegmentFileDropPhase, DDLNotSpecified, "", // databaseName ""); // tableName } if (vacummStatement_IsInAppendOnlyCleanupPhase(vacstmt)) { FaultInjector_InjectFaultIfSet( CompactionBeforeCleanupPhase, DDLNotSpecified, "", // databaseName ""); // tableName } #endif /* * MPP-23647. Update xid limits for heap as well as appendonly * relations. This allows setting relfrozenxid to correct value * for an appendonly (AO/CO) table. */ vac_strategy = bstrategy; vacuum_set_xid_limits(vacstmt->freeze_min_age, onerel->rd_rel->relisshared, &OldestXmin, &FreezeLimit); /* * Execute the various vacuum operations. Appendonly tables are treated * differently. */ if (RelationIsAoRows(onerel) || RelationIsAoCols(onerel)) { lazy_vacuum_aorel(onerel, vacstmt, updated_stats); return false; } vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats)); /* heap relation */ /* Set threshold for interesting free space = average request size */ /* XXX should we scale it up or down? Adjust vacuum.c too, if so */ vacrelstats->threshold = GetAvgFSMRequestSize(&onerel->rd_node); vacrelstats->num_index_scans = 0; /* Open all indexes of the relation */ vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel); vacrelstats->hasindex = (nindexes > 0); /* Do the vacuuming */ lazy_scan_heap(onerel, vacrelstats, Irel, nindexes, updated_stats); /* Done with indexes */ vac_close_indexes(nindexes, Irel, NoLock); /* * Optionally truncate the relation. * * Don't even think about it unless we have a shot at releasing a goodly * number of pages. Otherwise, the time taken isn't worth it. * * Note that after we've truncated the heap, it's too late to abort the * transaction; doing so would lose the sinval messages needed to tell * the other backends about the table being shrunk. We prevent interrupts * in that case; caller is responsible for re-enabling them after * committing the transaction. */ possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages; if (possibly_freeable > 0 && (possibly_freeable >= REL_TRUNCATE_MINIMUM || possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION)) { HOLD_INTERRUPTS(); heldoff = true; lazy_truncate_heap(onerel, vacrelstats); } /* Update shared free space map with final free space info */ lazy_update_fsm(onerel, vacrelstats); if (vacrelstats->tot_free_pages > MaxFSMPages) ereport(WARNING, (errmsg("relation \"%s.%s\" contains more than \"max_fsm_pages\" pages with useful free space", get_namespace_name(RelationGetNamespace(onerel)), RelationGetRelationName(onerel)), /* Only suggest VACUUM FULL if > 20% free */ (vacrelstats->tot_free_pages > vacrelstats->rel_pages * 0.20) ? errhint("Consider using VACUUM FULL on this relation or increasing the configuration parameter \"max_fsm_pages\".") : errhint("Consider increasing the configuration parameter \"max_fsm_pages\"."))); /* Update statistics in pg_class */ vac_update_relstats_from_list(onerel, vacrelstats->rel_pages, vacrelstats->rel_tuples, vacrelstats->hasindex, FreezeLimit, updated_stats); /* report results to the stats collector, too */ pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared, true /*vacrelstats->scanned_all*/, vacstmt->analyze, vacrelstats->rel_tuples); if (gp_indexcheck_vacuum == INDEX_CHECK_ALL || (gp_indexcheck_vacuum == INDEX_CHECK_SYSTEM && PG_CATALOG_NAMESPACE == RelationGetNamespace(onerel))) { int i; for (i = 0; i < nindexes; i++) { if (Irel[i]->rd_rel->relam == BTREE_AM_OID) _bt_validate_vacuum(Irel[i], onerel, OldestXmin); } } /* and log the action if appropriate */ if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0) { if (Log_autovacuum_min_duration == 0 || TimestampDifferenceExceeds(starttime, GetCurrentTimestamp(), Log_autovacuum_min_duration)) ereport(LOG, (errmsg("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n" "pages: %d removed, %d remain\n" "tuples: %.0f removed, %.0f remain\n" "system usage: %s", get_database_name(MyDatabaseId), get_namespace_name(RelationGetNamespace(onerel)), RelationGetRelationName(onerel), vacrelstats->num_index_scans, vacrelstats->pages_removed, vacrelstats->rel_pages, vacrelstats->tuples_deleted, vacrelstats->rel_tuples, pg_rusage_show(&ru0)))); } return heldoff; }
/* * ContinuousQueryWorkerStartup * * Launches a CQ worker, which continuously generates partial query results to send * back to the combiner process. */ void ContinuousQueryWorkerRun(Portal portal, ContinuousViewState *state, QueryDesc *queryDesc, ResourceOwner owner) { EState *estate = NULL; DestReceiver *dest; CmdType operation; MemoryContext oldcontext; int timeoutms = state->maxwaitms; MemoryContext runcontext; CQProcEntry *entry = GetCQProcEntry(MyCQId); ResourceOwner cqowner = ResourceOwnerCreate(NULL, "CQResourceOwner"); bool savereadonly = XactReadOnly; cq_stat_initialize(state->viewid, MyProcPid); dest = CreateDestReceiver(DestCombiner); SetCombinerDestReceiverParams(dest, MyCQId); /* workers only need read-only transactions */ XactReadOnly = true; runcontext = AllocSetContextCreate(TopMemoryContext, "CQRunContext", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); elog(LOG, "\"%s\" worker %d running", queryDesc->plannedstmt->cq_target->relname, MyProcPid); MarkWorkerAsRunning(MyCQId, MyWorkerId); pgstat_report_activity(STATE_RUNNING, queryDesc->sourceText); TupleBufferInitLatch(WorkerTupleBuffer, MyCQId, MyWorkerId, &MyProc->procLatch); oldcontext = MemoryContextSwitchTo(runcontext); retry: PG_TRY(); { bool xact_commit = true; TimestampTz last_process = GetCurrentTimestamp(); TimestampTz last_commit = GetCurrentTimestamp(); start_executor(queryDesc, runcontext, cqowner); CurrentResourceOwner = cqowner; estate = queryDesc->estate; operation = queryDesc->operation; /* * Initialize context that lives for the duration of a single iteration * of the main worker loop */ CQExecutionContext = AllocSetContextCreate(estate->es_query_cxt, "CQExecutionContext", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); estate->es_lastoid = InvalidOid; /* * Startup combiner receiver */ (*dest->rStartup) (dest, operation, queryDesc->tupDesc); for (;;) { if (!TupleBufferHasUnreadSlots()) { if (TimestampDifferenceExceeds(last_process, GetCurrentTimestamp(), state->emptysleepms)) { /* force stats flush */ cq_stat_report(true); pgstat_report_activity(STATE_IDLE, queryDesc->sourceText); TupleBufferWait(WorkerTupleBuffer, MyCQId, MyWorkerId); pgstat_report_activity(STATE_RUNNING, queryDesc->sourceText); } else pg_usleep(Min(WAIT_SLEEP_MS, state->emptysleepms) * 1000); } TupleBufferResetNotify(WorkerTupleBuffer, MyCQId, MyWorkerId); if (xact_commit) StartTransactionCommand(); set_snapshot(estate, cqowner); CurrentResourceOwner = cqowner; MemoryContextSwitchTo(estate->es_query_cxt); estate->es_processed = 0; estate->es_filtered = 0; /* * Run plan on a microbatch */ ExecutePlan(estate, queryDesc->planstate, operation, true, 0, timeoutms, ForwardScanDirection, dest); IncrementCQExecutions(1); TupleBufferClearPinnedSlots(); if (state->long_xact) { if (TimestampDifferenceExceeds(last_commit, GetCurrentTimestamp(), LONG_RUNNING_XACT_DURATION)) xact_commit = true; else xact_commit = false; } unset_snapshot(estate, cqowner); if (xact_commit) { CommitTransactionCommand(); last_commit = GetCurrentTimestamp(); } MemoryContextResetAndDeleteChildren(CQExecutionContext); MemoryContextSwitchTo(runcontext); CurrentResourceOwner = cqowner; if (estate->es_processed || estate->es_filtered) { /* * If the CV query is such that the select does not return any tuples * ex: select id where id=99; and id=99 does not exist, then this reset * will fail. What will happen is that the worker will block at the latch for every * allocated slot, TILL a cv returns a non-zero tuple, at which point * the worker will resume a simple sleep for the threshold time. */ last_process = GetCurrentTimestamp(); /* * Send stats to the collector */ cq_stat_report(false); } /* Has the CQ been deactivated? */ if (!entry->active) { if (ActiveSnapshotSet()) unset_snapshot(estate, cqowner); if (IsTransactionState()) CommitTransactionCommand(); break; } } CurrentResourceOwner = cqowner; /* * The cleanup functions below expect these things to be registered */ RegisterSnapshotOnOwner(estate->es_snapshot, cqowner); RegisterSnapshotOnOwner(queryDesc->snapshot, cqowner); RegisterSnapshotOnOwner(queryDesc->crosscheck_snapshot, cqowner); /* cleanup */ ExecutorFinish(queryDesc); ExecutorEnd(queryDesc); FreeQueryDesc(queryDesc); } PG_CATCH(); { EmitErrorReport(); FlushErrorState(); /* Since the worker is read-only, we can simply commit the transaction. */ if (ActiveSnapshotSet()) unset_snapshot(estate, cqowner); if (IsTransactionState()) CommitTransactionCommand(); TupleBufferUnpinAllPinnedSlots(); TupleBufferClearReaders(); /* This resets the es_query_ctx and in turn the CQExecutionContext */ MemoryContextResetAndDeleteChildren(runcontext); IncrementCQErrors(1); if (continuous_query_crash_recovery) goto retry; } PG_END_TRY(); (*dest->rShutdown) (dest); MemoryContextSwitchTo(oldcontext); MemoryContextDelete(runcontext); XactReadOnly = savereadonly; /* * Remove proc-level stats */ cq_stat_report(true); cq_stat_send_purge(state->viewid, MyProcPid, CQ_STAT_WORKER); CurrentResourceOwner = owner; }
/* * ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup() * to resolve conflicts with other backends holding buffer pins. * * We either resolve conflicts immediately or set a SIGALRM to wake us at * the limit of our patience. The sleep in LockBufferForCleanup() is * performed here, for code clarity. * * Resolve conflict by sending a SIGUSR1 reason to all backends to check if * they hold one of the buffer pins that is blocking Startup process. If so, * backends will take an appropriate error action, ERROR or FATAL. * * We also check for deadlocks before we wait, though applications that cause * these will be extremely rare. Deadlocks occur because if queries * wait on a lock, that must be behind an AccessExclusiveLock, which can only * be cleared if the Startup process replays a transaction completion record. * If Startup process is also waiting then that is a deadlock. The deadlock * can occur if the query is waiting and then the Startup sleeps, or if * Startup is sleeping and the query waits on a lock. We protect against * only the former sequence here, the latter sequence is checked prior to * the query sleeping, in CheckRecoveryConflictDeadlock(). */ void ResolveRecoveryConflictWithBufferPin(void) { bool sig_alarm_enabled = false; Assert(InHotStandby); if (MaxStandbyDelay == 0) { /* * We don't want to wait, so just tell everybody holding the pin to * get out of town. */ SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN); } else if (MaxStandbyDelay < 0) { /* * Send out a request to check for buffer pin deadlocks before we * wait. This is fairly cheap, so no need to wait for deadlock timeout * before trying to send it out. */ SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK); } else { TimestampTz then = GetLatestXLogTime(); TimestampTz now = GetCurrentTimestamp(); /* Are we past max_standby_delay? */ if (TimestampDifferenceExceeds(then, now, MaxStandbyDelay)) { /* * We're already behind, so clear a path as quickly as possible. */ SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN); } else { TimestampTz fin_time; /* Expected wake-up time by timer */ long timer_delay_secs; /* Amount of time we set timer * for */ int timer_delay_usecs; /* * Send out a request to check for buffer pin deadlocks before we * wait. This is fairly cheap, so no need to wait for deadlock * timeout before trying to send it out. */ SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK); /* * How much longer we should wait? */ fin_time = TimestampTzPlusMilliseconds(then, MaxStandbyDelay); TimestampDifference(now, fin_time, &timer_delay_secs, &timer_delay_usecs); /* * It's possible that the difference is less than a microsecond; * ensure we don't cancel, rather than set, the interrupt. */ if (timer_delay_secs == 0 && timer_delay_usecs == 0) timer_delay_usecs = 1; if (enable_standby_sig_alarm(timer_delay_secs, timer_delay_usecs, fin_time)) sig_alarm_enabled = true; else elog(FATAL, "could not set timer for process wakeup"); } } /* Wait to be signaled by UnpinBuffer() */ ProcWaitForSignal(); if (sig_alarm_enabled) { if (!disable_standby_sig_alarm()) elog(FATAL, "could not disable timer for process wakeup"); } }
/* * purge_dropped_db_segments */ static void purge_dropped_db_segments(bool force) { static TimestampTz last_purge_time = 0; List *db_oids; List *dbs_to_remove = NIL; HASH_SEQ_STATUS status; broker_db_meta *db_meta; if (!force && !TimestampDifferenceExceeds(last_purge_time, GetCurrentTimestamp(), 10 * 1000)) /* 10s */ return; db_oids = get_database_oids(); LWLockAcquire(IPCMessageBrokerIndexLock, LW_SHARED); hash_seq_init(&status, broker_meta->db_meta_hash); while ((db_meta = (broker_db_meta *) hash_seq_search(&status)) != NULL) { bool found = false; ListCell *lc; foreach(lc, db_oids) { if (lfirst_oid(lc) == db_meta->dbid) { found = true; break; } } if (!found) dbs_to_remove = lappend_oid(dbs_to_remove, db_meta->dbid); } LWLockRelease(IPCMessageBrokerIndexLock); if (list_length(dbs_to_remove)) { ListCell *lc; LWLockAcquire(IPCMessageBrokerIndexLock, LW_EXCLUSIVE); foreach(lc, dbs_to_remove) { Oid dbid = lfirst_oid(lc); bool found; db_meta = hash_search(broker_meta->db_meta_hash, &dbid, HASH_FIND, &found); Assert(found); Assert(db_meta->handle > 0); /* detach from main db segment */ if (db_meta->segment) dsm_detach(db_meta->segment); if (db_meta->lqueues) { int i; for (i = 0; i < continuous_query_num_workers; i++) { local_queue *local_buf = &db_meta->lqueues[i]; if (local_buf->slots) list_free_deep(local_buf->slots); } pfree(db_meta->lqueues); } hash_search(broker_meta->db_meta_hash, &dbid, HASH_REMOVE, &found); Assert(found); } mark_unused_locks_as_free(db_oids); LWLockRelease(IPCMessageBrokerIndexLock); }
/* * Send hot standby feedback message to primary, plus the current time, * in case they don't have a watch. */ static void XLogWalRcvSendHSFeedback(void) { char buf[sizeof(StandbyHSFeedbackMessage) + 1]; TimestampTz now; TransactionId nextXid; uint32 nextEpoch; TransactionId xmin; /* * If the user doesn't want status to be reported to the master, be sure * to exit before doing anything at all. */ if (wal_receiver_status_interval <= 0 || !hot_standby_feedback) return; /* Get current timestamp. */ now = GetCurrentTimestamp(); /* * Send feedback at most once per wal_receiver_status_interval. */ if (!TimestampDifferenceExceeds(feedback_message.sendTime, now, wal_receiver_status_interval * 1000)) return; /* * If Hot Standby is not yet active there is nothing to send. Check this * after the interval has expired to reduce number of calls. */ if (!HotStandbyActive()) return; /* * Make the expensive call to get the oldest xmin once we are certain * everything else has been checked. */ xmin = GetOldestXmin(true, false); /* * Get epoch and adjust if nextXid and oldestXmin are different sides of * the epoch boundary. */ GetNextXidAndEpoch(&nextXid, &nextEpoch); if (nextXid < xmin) nextEpoch--; /* * Always send feedback message. */ feedback_message.sendTime = now; feedback_message.xmin = xmin; feedback_message.epoch = nextEpoch; elog(DEBUG2, "sending hot standby feedback xmin %u epoch %u", feedback_message.xmin, feedback_message.epoch); /* Prepend with the message type and send it. */ buf[0] = 'h'; memcpy(&buf[1], &feedback_message, sizeof(StandbyHSFeedbackMessage)); walrcv_send(buf, sizeof(StandbyHSFeedbackMessage) + 1); }