/*
 * Detach from a barrier.  If 'arrive' is true then also increment the phase
 * if there are no other participants.  If there are other participants
 * waiting, then the phase will be advanced and they'll be released if they
 * were only waiting for the caller.  Return true if this participant was the
 * last to detach.
 */
static inline bool
BarrierDetachImpl(Barrier *barrier, bool arrive)
{
    bool        advance;
    bool        was_last;

    Assert(!barrier->static_party);

    SpinLockAcquire(&barrier->mutex);
    Assert(barrier->participants > 0);
    --barrier->participants;

    /*
     * Advance the phase (and thereby release any waiters) if everyone still
     * attached has already arrived, but only if someone is in fact waiting
     * (participants > 0) or this is a BarrierArriveAndDetach() call.
     */
    advance = (arrive || barrier->participants > 0) &&
        barrier->arrived == barrier->participants;
    if (advance)
    {
        barrier->arrived = 0;
        ++barrier->phase;
    }
    was_last = (barrier->participants == 0);
    SpinLockRelease(&barrier->mutex);

    /* Wake any waiters, after dropping the spinlock. */
    if (advance)
        ConditionVariableBroadcast(&barrier->condition_variable);

    return was_last;
}
/*
 * _bt_parallel_done() -- Mark the parallel scan as complete.
 *
 * When there are no pages left to scan, this function should be called to
 * notify other workers.  Otherwise, they might wait forever for the scan to
 * advance to the next page.
 */
void
_bt_parallel_done(IndexScanDesc scan)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
    BTParallelScanDesc btscan;
    bool        became_done = false;

    /* Non-parallel scans have nothing to do here. */
    if (parallel_scan == NULL)
        return;

    btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
                                                  parallel_scan->ps_offset);

    /*
     * Flag the scan as done for this combination of scan keys, unless some
     * other process already beat us to it.  See also
     * _bt_advance_array_keys.
     */
    SpinLockAcquire(&btscan->btps_mutex);
    if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
        btscan->btps_pageStatus != BTPARALLEL_DONE)
    {
        btscan->btps_pageStatus = BTPARALLEL_DONE;
        became_done = true;
    }
    SpinLockRelease(&btscan->btps_mutex);

    /* If we flipped the status, wake all workers attached to this scan. */
    if (became_done)
        ConditionVariableBroadcast(&btscan->btps_cv);
}
/*
 * Cleanup all temporary slots created in current session.
 *
 * Scans the shared slot array for slots this backend holds (active_pid ==
 * MyProcPid) and drops each one.  The caller must not currently own a slot.
 */
void
ReplicationSlotCleanup(void)
{
    int         i;

    Assert(MyReplicationSlot == NULL);

restart:
    LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
    for (i = 0; i < max_replication_slots; i++)
    {
        ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];

        if (!s->in_use)
            continue;

        SpinLockAcquire(&s->mutex);
        if (s->active_pid == MyProcPid)
        {
            /* Only temporary slots can be held by this backend here. */
            Assert(s->data.persistency == RS_TEMPORARY);
            SpinLockRelease(&s->mutex);
            /*
             * Drop the control lock before dropping the slot:
             * ReplicationSlotDropPtr() takes locks of its own.
             */
            LWLockRelease(ReplicationSlotControlLock);  /* avoid deadlock */

            ReplicationSlotDropPtr(s);

            /* Let anyone waiting for this slot know it's gone. */
            ConditionVariableBroadcast(&s->active_cv);

            /*
             * We released the control lock, so the array may have changed
             * under us; restart the scan from the beginning.
             */
            goto restart;
        }
        else
            SpinLockRelease(&s->mutex);
    }

    LWLockRelease(ReplicationSlotControlLock);
}
/*
 * BitmapDoneInitializingSharedState - Shared state is initialized
 *
 * By this time the leader has already populated the TBM and initialized the
 * shared state so wake up other processes.
 */
static inline void
BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate)
{
    /* Publish the state change under the mutex ... */
    SpinLockAcquire(&pstate->mutex);
    pstate->state = BM_FINISHED;
    SpinLockRelease(&pstate->mutex);
    /* ... then wake waiters, after the spinlock is released. */
    ConditionVariableBroadcast(&pstate->cv);
}
/*
 * Release the replication slot that this backend considers to own.
 *
 * This or another backend can re-acquire the slot later.
 * Resources this slot requires will be preserved.
 */
void
ReplicationSlotRelease(void)
{
    ReplicationSlot *slot = MyReplicationSlot;

    Assert(slot != NULL && slot->active_pid != 0);

    if (slot->data.persistency == RS_EPHEMERAL)
    {
        /*
         * Delete the slot.  There is no !PANIC case where this is allowed to
         * fail, all that may happen is an incomplete cleanup of the on-disk
         * data.
         */
        ReplicationSlotDropAcquired();
    }

    /*
     * If slot needed to temporarily restrain both data and catalog xmin to
     * create the catalog snapshot, remove that temporary constraint.
     * Snapshots can only be exported while the initial snapshot is still
     * acquired.
     */
    if (!TransactionIdIsValid(slot->data.xmin) &&
        TransactionIdIsValid(slot->effective_xmin))
    {
        SpinLockAcquire(&slot->mutex);
        slot->effective_xmin = InvalidTransactionId;
        SpinLockRelease(&slot->mutex);
        ReplicationSlotsComputeRequiredXmin(false);
    }

    if (slot->data.persistency == RS_PERSISTENT)
    {
        /*
         * Mark persistent slot inactive.  We're not freeing it, just
         * disconnecting, but wake up others that may be waiting for it.
         */
        SpinLockAcquire(&slot->mutex);
        slot->active_pid = 0;
        SpinLockRelease(&slot->mutex);
        ConditionVariableBroadcast(&slot->active_cv);
    }

    MyReplicationSlot = NULL;

    /* might not have been set when we've been a plain slot */
    LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
    MyPgXact->vacuumFlags &= ~PROC_IN_LOGICAL_DECODING;
    LWLockRelease(ProcArrayLock);
}
/*
 * Permanently drop the replication slot which will be released by the point
 * this function returns.
 */
static void
ReplicationSlotDropPtr(ReplicationSlot *slot)
{
    char        path[MAXPGPATH];
    char        tmppath[MAXPGPATH];

    /*
     * If some other backend ran this code concurrently with us, we might try
     * to delete a slot with a certain name while someone else was trying to
     * create a slot with the same name.
     */
    LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE);

    /* Generate pathnames. */
    sprintf(path, "pg_replslot/%s", NameStr(slot->data.name));
    sprintf(tmppath, "pg_replslot/%s.tmp", NameStr(slot->data.name));

    /*
     * Rename the slot directory on disk, so that we'll no longer recognize
     * this as a valid slot.  Note that if this fails, we've got to mark the
     * slot inactive before bailing out.  If we're dropping an ephemeral or a
     * temporary slot, we better never fail hard as the caller won't expect
     * the slot to survive and this might get called during error handling.
     */
    if (rename(path, tmppath) == 0)
    {
        /*
         * We need to fsync() the directory we just renamed and its parent to
         * make sure that our changes are on disk in a crash-safe fashion.  If
         * fsync() fails, we can't be sure whether the changes are on disk or
         * not.  For now, we handle that by panicking;
         * StartupReplicationSlots() will try to straighten it out after
         * restart.
         */
        START_CRIT_SECTION();
        fsync_fname(tmppath, true);
        fsync_fname("pg_replslot", true);
        END_CRIT_SECTION();
    }
    else
    {
        /* Only a persistent slot's loss warrants a hard error here. */
        bool        fail_softly = slot->data.persistency != RS_PERSISTENT;

        SpinLockAcquire(&slot->mutex);
        slot->active_pid = 0;
        SpinLockRelease(&slot->mutex);

        /* wake up anyone waiting on this slot */
        ConditionVariableBroadcast(&slot->active_cv);

        ereport(fail_softly ? WARNING : ERROR,
                (errcode_for_file_access(),
                 errmsg("could not rename file \"%s\" to \"%s\": %m",
                        path, tmppath)));
    }

    /*
     * The slot is definitely gone.  Lock out concurrent scans of the array
     * long enough to kill it.  It's OK to clear the active PID here without
     * grabbing the mutex because nobody else can be scanning the array here,
     * and nobody can be attached to this slot and thus access it without
     * scanning the array.
     *
     * Also wake up processes waiting for it.
     */
    LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE);
    slot->active_pid = 0;
    slot->in_use = false;
    LWLockRelease(ReplicationSlotControlLock);
    ConditionVariableBroadcast(&slot->active_cv);

    /*
     * Slot is dead and doesn't prevent resource removal anymore, recompute
     * limits.
     */
    ReplicationSlotsComputeRequiredXmin(false);
    ReplicationSlotsComputeRequiredLSN();

    /*
     * If removing the directory fails, the worst thing that will happen is
     * that the user won't be able to create a new slot with the same name
     * until the next server restart.  We warn about it, but that's all.
     */
    if (!rmtree(tmppath, true))
        ereport(WARNING,
                (errcode_for_file_access(),
                 errmsg("could not remove directory \"%s\"", tmppath)));

    /*
     * We release this at the very end, so that nobody starts trying to create
     * a slot while we're still cleaning up the detritus of the old one.
     */
    LWLockRelease(ReplicationSlotAllocationLock);
}
/*
 * Find a previously created slot and mark it as used by this backend.
 *
 * If the slot is already owned by another backend: with nowait, raise an
 * ERROR immediately; otherwise sleep on the slot's condition variable and
 * retry from scratch once signaled.
 */
void
ReplicationSlotAcquire(const char *name, bool nowait)
{
    ReplicationSlot *slot;
    int         active_pid;
    int         i;

retry:
    Assert(MyReplicationSlot == NULL);

    /*
     * Search for the named slot and mark it active if we find it.  If the
     * slot is already active, we exit the loop with active_pid set to the PID
     * of the backend that owns it.
     */
    active_pid = 0;
    slot = NULL;
    LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
    for (i = 0; i < max_replication_slots; i++)
    {
        ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];

        if (s->in_use && strcmp(name, NameStr(s->data.name)) == 0)
        {
            /*
             * This is the slot we want.  We don't know yet if it's active, so
             * get ready to sleep on it in case it is.  (We may end up not
             * sleeping, but we don't want to do this while holding the
             * spinlock.)
             */
            ConditionVariablePrepareToSleep(&s->active_cv);

            SpinLockAcquire(&s->mutex);

            active_pid = s->active_pid;
            if (active_pid == 0)
                active_pid = s->active_pid = MyProcPid;

            SpinLockRelease(&s->mutex);
            slot = s;

            break;
        }
    }
    LWLockRelease(ReplicationSlotControlLock);

    /* If we did not find the slot, error out. */
    if (slot == NULL)
        ereport(ERROR,
                (errcode(ERRCODE_UNDEFINED_OBJECT),
                 errmsg("replication slot \"%s\" does not exist", name)));

    /*
     * If we found the slot but it's already active in another backend, we
     * either error out or retry after a short wait, as caller specified.
     */
    if (active_pid != MyProcPid)
    {
        if (nowait)
            ereport(ERROR,
                    (errcode(ERRCODE_OBJECT_IN_USE),
                     errmsg("replication slot \"%s\" is active for PID %d",
                            name, active_pid)));

        /* Wait here until we get signaled, and then restart */
        ConditionVariableSleep(&slot->active_cv,
                               WAIT_EVENT_REPLICATION_SLOT_DROP);
        ConditionVariableCancelSleep();
        goto retry;
    }
    else
        ConditionVariableCancelSleep();  /* no sleep needed after all */

    /* Let everybody know we've modified this slot */
    ConditionVariableBroadcast(&slot->active_cv);

    /* We made this slot active, so it's ours now. */
    MyReplicationSlot = slot;
}
/*
 * Create a new replication slot and mark it as used by this backend.
 *
 * name: Name of the slot
 * db_specific: logical decoding is db specific; if the slot is going to
 *     be used for that pass true, otherwise false.
 * persistency: desired persistency of the new slot (ephemeral, temporary,
 *     or persistent).
 */
void
ReplicationSlotCreate(const char *name, bool db_specific,
                      ReplicationSlotPersistency persistency)
{
    ReplicationSlot *slot = NULL;
    int         i;

    Assert(MyReplicationSlot == NULL);

    ReplicationSlotValidateName(name, ERROR);

    /*
     * If some other backend ran this code concurrently with us, we'd likely
     * both allocate the same slot, and that would be bad.  We'd also be at
     * risk of missing a name collision.  Also, we don't want to try to create
     * a new slot while somebody's busy cleaning up an old one, because we
     * might both be monkeying with the same directory.
     */
    LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE);

    /*
     * Check for name collision, and identify an allocatable slot.  We need to
     * hold ReplicationSlotControlLock in shared mode for this, so that nobody
     * else can change the in_use flags while we're looking at them.
     */
    LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
    for (i = 0; i < max_replication_slots; i++)
    {
        ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];

        if (s->in_use && strcmp(name, NameStr(s->data.name)) == 0)
            ereport(ERROR,
                    (errcode(ERRCODE_DUPLICATE_OBJECT),
                     errmsg("replication slot \"%s\" already exists", name)));
        if (!s->in_use && slot == NULL)
            slot = s;
    }
    LWLockRelease(ReplicationSlotControlLock);

    /* If all slots are in use, we're out of luck. */
    if (slot == NULL)
        ereport(ERROR,
                (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
                 errmsg("all replication slots are in use"),
                 errhint("Free one or increase max_replication_slots.")));

    /*
     * Since this slot is not in use, nobody should be looking at any part of
     * it other than the in_use field unless they're trying to allocate it.
     * And since we hold ReplicationSlotAllocationLock, nobody except us can
     * be doing that.  So it's safe to initialize the slot.
     */
    Assert(!slot->in_use);
    Assert(slot->active_pid == 0);

    /* first initialize persistent data */
    memset(&slot->data, 0, sizeof(ReplicationSlotPersistentData));
    StrNCpy(NameStr(slot->data.name), name, NAMEDATALEN);
    slot->data.database = db_specific ? MyDatabaseId : InvalidOid;
    slot->data.persistency = persistency;

    /* and then data only present in shared memory */
    slot->just_dirtied = false;
    slot->dirty = false;
    slot->effective_xmin = InvalidTransactionId;
    slot->effective_catalog_xmin = InvalidTransactionId;
    slot->candidate_catalog_xmin = InvalidTransactionId;
    slot->candidate_xmin_lsn = InvalidXLogRecPtr;
    slot->candidate_restart_valid = InvalidXLogRecPtr;
    slot->candidate_restart_lsn = InvalidXLogRecPtr;

    /*
     * Create the slot on disk.  We haven't actually marked the slot allocated
     * yet, so no special cleanup is required if this errors out.
     */
    CreateSlotOnDisk(slot);

    /*
     * We need to briefly prevent any other backend from iterating over the
     * slots while we flip the in_use flag.  We also need to set the active
     * flag while holding the ControlLock as otherwise a concurrent
     * SlotAcquire() could acquire the slot as well.
     */
    LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE);

    slot->in_use = true;

    /* We can now mark the slot active, and that makes it our slot. */
    SpinLockAcquire(&slot->mutex);
    Assert(slot->active_pid == 0);
    slot->active_pid = MyProcPid;
    SpinLockRelease(&slot->mutex);
    MyReplicationSlot = slot;

    LWLockRelease(ReplicationSlotControlLock);

    /*
     * Now that the slot has been marked as in_use and active, it's safe to
     * let somebody else try to allocate a slot.
     */
    LWLockRelease(ReplicationSlotAllocationLock);

    /* Let everybody know we've modified this slot */
    ConditionVariableBroadcast(&slot->active_cv);
}
/*
 * Arrive at this barrier, wait for all other attached participants to arrive
 * too and then return.  Increments the current phase.  The caller must be
 * attached.
 *
 * While waiting, pg_stat_activity shows a wait_event_class and wait_event
 * controlled by the wait_event_info passed in, which should be a value from
 * one of the WaitEventXXX enums defined in pgstat.h.
 *
 * Return true in one arbitrarily chosen participant.  Return false in all
 * others.  The return code can be used to elect one participant to execute a
 * phase of work that must be done serially while other participants wait.
 */
bool
BarrierArriveAndWait(Barrier *barrier, uint32 wait_event_info)
{
    bool        release = false;
    bool        elected;
    int         start_phase;
    int         next_phase;

    SpinLockAcquire(&barrier->mutex);
    start_phase = barrier->phase;
    next_phase = start_phase + 1;
    ++barrier->arrived;
    if (barrier->arrived == barrier->participants)
    {
        /* We're the last to arrive: advance the phase and claim election. */
        release = true;
        barrier->arrived = 0;
        barrier->phase = next_phase;
        barrier->elected = next_phase;
    }
    SpinLockRelease(&barrier->mutex);

    /*
     * If we were the last expected participant to arrive, we can release our
     * peers and return true to indicate that this backend has been elected to
     * perform any serial work.
     */
    if (release)
    {
        ConditionVariableBroadcast(&barrier->condition_variable);

        return true;
    }

    /*
     * Otherwise we have to wait for the last participant to arrive and
     * advance the phase.
     */
    elected = false;
    ConditionVariablePrepareToSleep(&barrier->condition_variable);
    for (;;)
    {
        /*
         * We know that phase must either be start_phase, indicating that we
         * need to keep waiting, or next_phase, indicating that the last
         * participant that we were waiting for has either arrived or detached
         * so that the next phase has begun.  The phase cannot advance any
         * further than that without this backend's participation, because
         * this backend is attached.
         */
        SpinLockAcquire(&barrier->mutex);
        Assert(barrier->phase == start_phase || barrier->phase == next_phase);
        release = barrier->phase == next_phase;
        if (release && barrier->elected != next_phase)
        {
            /*
             * Usually the backend that arrives last and releases the other
             * backends is elected to return true (see above), so that it can
             * begin processing serial work while it has a CPU timeslice.
             * However, if the barrier advanced because someone detached, then
             * one of the backends that is awoken will need to be elected.
             */
            barrier->elected = barrier->phase;
            elected = true;
        }
        SpinLockRelease(&barrier->mutex);
        if (release)
            break;
        ConditionVariableSleep(&barrier->condition_variable, wait_event_info);
    }
    ConditionVariableCancelSleep();

    return elected;
}