/*
 * emit a completed btree page, and release the working storage.
 */
static void
_bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
{
	// Fetch gp_persistent_relation_node information that will be added to XLOG record.
	RelationFetchGpRelationNodeForXLog(wstate->index);

	/* Ensure rd_smgr is open (could have been closed by relcache flush!) */
	RelationOpenSmgr(wstate->index);

	/* XLOG stuff */
	if (wstate->btws_use_wal)
	{
		_bt_lognewpage(wstate->index, page, blkno);
	}
	else
	{
		/* Leave the page LSN zero if not WAL-logged, but set TLI anyway */
		PageSetTLI(page, ThisTimeLineID);
	}

	/*
	 * If we have to write pages nonsequentially, fill in the space with
	 * zeroes until we come back and overwrite.  This is not logically
	 * necessary on standard Unix filesystems (unwritten space will read as
	 * zeroes anyway), but it should help to avoid fragmentation.  The dummy
	 * pages aren't WAL-logged though.
	 */
	while (blkno > wstate->btws_pages_written)
	{
		if (!wstate->btws_zeropage)
			wstate->btws_zeropage = (Page) palloc0(BLCKSZ);

		// -------- MirroredLock ----------
		// UNDONE: Unfortunately, I think we write temp relations to the mirror...
		LWLockAcquire(MirroredLock, LW_SHARED);

		smgrextend(wstate->index->rd_smgr, wstate->btws_pages_written++,
				   (char *) wstate->btws_zeropage, true);

		LWLockRelease(MirroredLock);
		// -------- MirroredLock ----------
	}

	// -------- MirroredLock ----------
	// UNDONE: Unfortunately, I think we write temp relations to the mirror...
	LWLockAcquire(MirroredLock, LW_SHARED);

	/*
	 * Now write the page.  We say isTemp = true even if it's not a temp
	 * index, because there's no need for smgr to schedule an fsync for this
	 * write; we'll do it ourselves before ending the build.
	 */
	if (blkno == wstate->btws_pages_written)
	{
		/* extending the file... */
		smgrextend(wstate->index->rd_smgr, blkno, (char *) page, true);
		wstate->btws_pages_written++;
	}
	else
	{
		/* overwriting a block we zero-filled before */
		smgrwrite(wstate->index->rd_smgr, blkno, (char *) page, true);
	}

	LWLockRelease(MirroredLock);
	// -------- MirroredLock ----------

	pfree(page);
}
/*
 * Create a new replication slot and mark it as used by this backend.
 *
 * name: Name of the slot
 * db_specific: logical decoding is db specific; if the slot is going to
 *	   be used for that pass true, otherwise false.
 */
void
ReplicationSlotCreate(const char *name, bool db_specific,
					  ReplicationSlotPersistency persistency)
{
	ReplicationSlot *slot = NULL;
	int			i;

	Assert(MyReplicationSlot == NULL);

	ReplicationSlotValidateName(name, ERROR);

	/*
	 * If some other backend ran this code concurrently with us, we'd likely
	 * both allocate the same slot, and that would be bad.  We'd also be at
	 * risk of missing a name collision.  Also, we don't want to try to
	 * create a new slot while somebody's busy cleaning up an old one,
	 * because we might both be monkeying with the same directory.
	 */
	LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE);

	/*
	 * Check for name collision, and identify an allocatable slot.  We need
	 * to hold ReplicationSlotControlLock in shared mode for this, so that
	 * nobody else can change the in_use flags while we're looking at them.
	 */
	LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
	for (i = 0; i < max_replication_slots; i++)
	{
		ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];

		if (s->in_use && strcmp(name, NameStr(s->data.name)) == 0)
			ereport(ERROR,
					(errcode(ERRCODE_DUPLICATE_OBJECT),
					 errmsg("replication slot \"%s\" already exists", name)));
		if (!s->in_use && slot == NULL)
			slot = s;
	}
	LWLockRelease(ReplicationSlotControlLock);

	/* If all slots are in use, we're out of luck. */
	if (slot == NULL)
		ereport(ERROR,
				(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
				 errmsg("all replication slots are in use"),
				 errhint("Free one or increase max_replication_slots.")));

	/*
	 * Since this slot is not in use, nobody should be looking at any part of
	 * it other than the in_use field unless they're trying to allocate it.
	 * And since we hold ReplicationSlotAllocationLock, nobody except us can
	 * be doing that.  So it's safe to initialize the slot.
	 */
	Assert(!slot->in_use);
	Assert(slot->active_pid == 0);
	slot->data.persistency = persistency;
	slot->data.xmin = InvalidTransactionId;
	slot->effective_xmin = InvalidTransactionId;
	StrNCpy(NameStr(slot->data.name), name, NAMEDATALEN);
	slot->data.database = db_specific ? MyDatabaseId : InvalidOid;
	slot->data.restart_lsn = InvalidXLogRecPtr;

	/*
	 * Create the slot on disk.  We haven't actually marked the slot
	 * allocated yet, so no special cleanup is required if this errors out.
	 */
	CreateSlotOnDisk(slot);

	/*
	 * We need to briefly prevent any other backend from iterating over the
	 * slots while we flip the in_use flag.  We also need to set the active
	 * flag while holding the ControlLock as otherwise a concurrent
	 * SlotAcquire() could acquire the slot as well.
	 */
	LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE);

	slot->in_use = true;

	/* We can now mark the slot active, and that makes it our slot. */
	{
		volatile ReplicationSlot *vslot = slot;

		SpinLockAcquire(&slot->mutex);
		Assert(vslot->active_pid == 0);
		vslot->active_pid = MyProcPid;
		SpinLockRelease(&slot->mutex);
		MyReplicationSlot = slot;
	}

	LWLockRelease(ReplicationSlotControlLock);

	/*
	 * Now that the slot has been marked as in_use and active, it's safe to
	 * let somebody else try to allocate a slot.
	 */
	LWLockRelease(ReplicationSlotAllocationLock);
}
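/*
 * Illustrative usage sketch (not part of the original file): a caller
 * creates a slot and later hands it back.  RS_EPHEMERAL and
 * ReplicationSlotRelease() are the upstream PostgreSQL 9.4+ API, but the
 * function name demo_create_slot and the surrounding flow are invented
 * for this example.  ReplicationSlotCreate() itself throws ERROR on a
 * duplicate name or when all max_replication_slots entries are taken.
 */
static void
demo_create_slot(const char *name)
{
	ReplicationSlotCreate(name, false /* not db-specific */ , RS_EPHEMERAL);

	/* ... MyReplicationSlot now points at the acquired slot ... */

	ReplicationSlotRelease();
}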
/*
 * Shared functionality between saving and creating a replication slot.
 */
static void
SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel)
{
	char		tmppath[MAXPGPATH];
	char		path[MAXPGPATH];
	int			fd;
	ReplicationSlotOnDisk cp;
	bool		was_dirty;

	/* first check whether there's something to write out */
	{
		volatile ReplicationSlot *vslot = slot;

		SpinLockAcquire(&vslot->mutex);
		was_dirty = vslot->dirty;
		vslot->just_dirtied = false;
		SpinLockRelease(&vslot->mutex);
	}

	/* and don't do anything if there's nothing to write */
	if (!was_dirty)
		return;

	LWLockAcquire(slot->io_in_progress_lock, LW_EXCLUSIVE);

	/* silence valgrind :( */
	memset(&cp, 0, sizeof(ReplicationSlotOnDisk));

	sprintf(tmppath, "%s/state.tmp", dir);
	sprintf(path, "%s/state", dir);

	fd = OpenTransientFile(tmppath,
						   O_CREAT | O_EXCL | O_WRONLY | PG_BINARY,
						   S_IRUSR | S_IWUSR);
	if (fd < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m",
						tmppath)));
		return;
	}

	cp.magic = SLOT_MAGIC;
	INIT_CRC32C(cp.checksum);
	cp.version = SLOT_VERSION;
	cp.length = ReplicationSlotOnDiskV2Size;

	SpinLockAcquire(&slot->mutex);

	memcpy(&cp.slotdata, &slot->data, sizeof(ReplicationSlotPersistentData));

	SpinLockRelease(&slot->mutex);

	COMP_CRC32C(cp.checksum,
				(char *) (&cp) + SnapBuildOnDiskNotChecksummedSize,
				SnapBuildOnDiskChecksummedSize);
	FIN_CRC32C(cp.checksum);

	if ((write(fd, &cp, sizeof(cp))) != sizeof(cp))
	{
		int			save_errno = errno;

		CloseTransientFile(fd);
		errno = save_errno;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not write to file \"%s\": %m",
						tmppath)));
		return;
	}

	/* fsync the temporary file */
	if (pg_fsync(fd) != 0)
	{
		int			save_errno = errno;

		CloseTransientFile(fd);
		errno = save_errno;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m",
						tmppath)));
		return;
	}

	CloseTransientFile(fd);

	/* rename to permanent file, fsync file and directory */
	if (rename(tmppath, path) != 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						tmppath, path)));
		return;
	}

	/* Check CreateSlot() for the reasoning of using a crit. section. */
	START_CRIT_SECTION();

	fsync_fname(path, false);
	fsync_fname((char *) dir, true);
	fsync_fname("pg_replslot", true);

	END_CRIT_SECTION();

	/*
	 * Successfully wrote, unset dirty bit, unless somebody dirtied again
	 * already.
	 */
	{
		volatile ReplicationSlot *vslot = slot;

		SpinLockAcquire(&vslot->mutex);
		if (!vslot->just_dirtied)
			vslot->dirty = false;
		SpinLockRelease(&vslot->mutex);
	}

	LWLockRelease(slot->io_in_progress_lock);
}
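/*
 * SaveSlotToPath() above relies on the classic atomic-replace recipe:
 * write a temp file, fsync it, rename() it over the final name, then
 * fsync the containing directory so the rename itself survives a crash.
 * The standalone POSIX sketch below shows the same recipe outside any
 * PostgreSQL infrastructure; the function name and fixed file names are
 * illustrative only.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int
atomic_replace(const char *dir, const void *buf, size_t len)
{
	char		tmppath[1024];
	char		path[1024];
	int			fd;

	snprintf(tmppath, sizeof(tmppath), "%s/state.tmp", dir);
	snprintf(path, sizeof(path), "%s/state", dir);

	fd = open(tmppath, O_CREAT | O_EXCL | O_WRONLY, 0600);
	if (fd < 0)
		return -1;
	if (write(fd, buf, len) != (ssize_t) len || fsync(fd) != 0)
	{
		close(fd);
		unlink(tmppath);
		return -1;
	}
	close(fd);

	if (rename(tmppath, path) != 0)	/* atomic swap of the two names */
		return -1;

	fd = open(dir, O_RDONLY);		/* make the rename itself durable */
	if (fd < 0)
		return -1;
	if (fsync(fd) != 0)
	{
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}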
/*
 * StrategyGetBuffer
 *
 *	Called by the bufmgr to get the next candidate buffer to use in
 *	BufferAlloc(). The only hard requirement BufferAlloc() has is that
 *	the selected buffer must not currently be pinned by anyone.
 *
 *	strategy is a BufferAccessStrategy object, or NULL for default strategy.
 *
 *	To ensure that no one else can pin the buffer before we do, we must
 *	return the buffer with the buffer header spinlock still held.  If
 *	*lock_held is set on exit, we have returned with the BufFreelistLock
 *	still held, as well; the caller must release that lock once the spinlock
 *	is dropped.  We do it that way because releasing the BufFreelistLock
 *	might awaken other processes, and it would be bad to do the associated
 *	kernel calls while holding the buffer header spinlock.
 */
volatile BufferDesc *
StrategyGetBuffer(BufferAccessStrategy strategy, bool *lock_held)
{
	volatile BufferDesc *buf;
	int			trycounter;

	/*
	 * If given a strategy object, see whether it can select a buffer. We
	 * assume strategy objects don't need the BufFreelistLock.
	 */
	if (strategy != NULL)
	{
		buf = GetBufferFromRing(strategy);
		if (buf != NULL)
		{
			*lock_held = false;
			return buf;
		}
	}

	/* Nope, so lock the freelist */
	*lock_held = true;
	LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);

	/*
	 * We count buffer allocation requests so that the bgwriter can estimate
	 * the rate of buffer consumption.  Note that buffers recycled by a
	 * strategy object are intentionally not counted here.
	 */
	StrategyControl->numBufferAllocs++;

	/*
	 * Try to get a buffer from the freelist.  Note that the freeNext fields
	 * are considered to be protected by the BufFreelistLock not the
	 * individual buffer spinlocks, so it's OK to manipulate them without
	 * holding the spinlock.
	 */
	while (StrategyControl->firstFreeBuffer >= 0)
	{
		buf = &BufferDescriptors[StrategyControl->firstFreeBuffer];
		Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);

		/* Unconditionally remove buffer from freelist */
		StrategyControl->firstFreeBuffer = buf->freeNext;
		buf->freeNext = FREENEXT_NOT_IN_LIST;

		/*
		 * If the buffer is pinned or has a nonzero usage_count, we cannot
		 * use it; discard it and retry.  (This can only happen if VACUUM
		 * put a valid buffer in the freelist and then someone else used it
		 * before we got to it.  It's probably impossible altogether as of
		 * 8.3, but we'd better check anyway.)
		 */
		LockBufHdr(buf);
		if (buf->refcount == 0 && buf->usage_count == 0)
		{
			if (strategy != NULL)
				AddBufferToRing(strategy, buf);
			return buf;
		}
		UnlockBufHdr(buf);
	}

	/* Nothing on the freelist, so run the "clock sweep" algorithm */
	trycounter = NBuffers;
	for (;;)
	{
		buf = &BufferDescriptors[StrategyControl->nextVictimBuffer];

		if (++StrategyControl->nextVictimBuffer >= NBuffers)
		{
			StrategyControl->nextVictimBuffer = 0;
			StrategyControl->completePasses++;
		}

		/*
		 * If the buffer is pinned or has a nonzero usage_count, we cannot
		 * use it; decrement the usage_count (unless pinned) and keep
		 * scanning.
		 */
		LockBufHdr(buf);
		if (buf->refcount == 0)
		{
			if (buf->usage_count > 0)
			{
				buf->usage_count--;
				trycounter = NBuffers;
			}
			else
			{
				/* Found a usable buffer */
				if (strategy != NULL)
					AddBufferToRing(strategy, buf);
				return buf;
			}
		}
		else if (--trycounter == 0)
		{
			/*
			 * We've scanned all the buffers without making any state
			 * changes, so all the buffers are pinned (or were when we
			 * looked at them).  We could hope that someone will free one
			 * eventually, but it's probably better to fail than to risk
			 * getting stuck in an infinite loop.
			 */
			UnlockBufHdr(buf);
			elog(ERROR, "no unpinned buffers available");
		}
		UnlockBufHdr(buf);
	}

	/* not reached */
	return NULL;
}
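/*
 * The clock sweep above decrements usage_count on each unpinned buffer it
 * passes and evicts the first unpinned buffer whose count has reached
 * zero.  Below is a stripped-down, self-contained illustration of just
 * that policy; all names and the toy arrays are invented for the demo and
 * do not exist in the real buffer manager.
 */
#include <stdio.h>

#define DEMO_NBUF 8

static int	demo_refcount[DEMO_NBUF] = {0, 1, 0, 0, 2, 0, 0, 0};
static int	demo_usage[DEMO_NBUF] = {3, 5, 0, 2, 4, 1, 2, 1};
static int	demo_next_victim = 0;

/* Returns the index of an evictable buffer, or -1 if all are pinned. */
static int
demo_clock_sweep(void)
{
	int			tries = DEMO_NBUF;

	for (;;)
	{
		int			b = demo_next_victim;

		demo_next_victim = (demo_next_victim + 1) % DEMO_NBUF;
		if (demo_refcount[b] == 0)
		{
			if (demo_usage[b] > 0)
			{
				demo_usage[b]--;	/* recently used: give it another pass */
				tries = DEMO_NBUF;	/* we changed state, reset the counter */
			}
			else
				return b;			/* unpinned and cold: evict this one */
		}
		else if (--tries == 0)
			return -1;				/* every buffer is pinned right now */
	}
}

int
main(void)
{
	printf("victim: %d\n", demo_clock_sweep());
	return 0;
}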
static void
DtmXactCallback(XactEvent event, void *arg)
{
	//XTM_INFO("%d: DtmXactCallback event=%d nextxid=%d\n", getpid(), event, DtmNextXid);
	switch (event)
	{
		case XACT_EVENT_START:
			//XTM_INFO("%d: normal=%d, initialized=%d, replication=%d, bgw=%d, vacuum=%d\n",
			//	getpid(), IsNormalProcessingMode(), dtm->initialized, MMDoReplication, IsBackgroundWorker, IsAutoVacuumWorkerProcess());
			if (IsNormalProcessingMode() && dtm->initialized && MMDoReplication &&
				!am_walsender && !IsBackgroundWorker && !IsAutoVacuumWorkerProcess())
			{
				MMBeginTransaction();
			}
			break;
#if 0
		case XACT_EVENT_PRE_COMMIT:
		case XACT_EVENT_PARALLEL_PRE_COMMIT:
			{
				TransactionId xid = GetCurrentTransactionIdIfAny();

				if (!MMIsDistributedTrans && TransactionIdIsValid(xid))
				{
					XTM_INFO("%d: Will ignore transaction %u\n", getpid(), xid);
					MMMarkTransAsLocal(xid);
				}
				break;
			}
#endif
		case XACT_EVENT_COMMIT:
		case XACT_EVENT_ABORT:
			if (TransactionIdIsValid(DtmNextXid))
			{
				if (!DtmVoted)
				{
					ArbiterSetTransStatus(DtmNextXid, TRANSACTION_STATUS_ABORTED, false);
				}
				if (event == XACT_EVENT_COMMIT)
				{
					/*
					 * Now transaction status is already written in CLOG,
					 * so we can remove information about it from hash table
					 */
					LWLockAcquire(dtm->hashLock, LW_EXCLUSIVE);
					hash_search(xid_in_doubt, &DtmNextXid, HASH_REMOVE, NULL);
					LWLockRelease(dtm->hashLock);
				}
#if 0							/* should be handled now using DtmVoted flag */
				else
				{
					/*
					 * A transaction at this node can be aborted because of a
					 * transaction failure at some other node before it has
					 * done anything and been assigned an Xid; in that case
					 * Postgres does not call SetTransactionStatus, so we
					 * have to send the report to DTMD here.
					 */
					if (!TransactionIdIsValid(GetCurrentTransactionIdIfAny()))
					{
						XTM_INFO("%d: abort transaction on DTMD\n", getpid());
						ArbiterSetTransStatus(DtmNextXid, TRANSACTION_STATUS_ABORTED, false);
					}
				}
#endif
				DtmNextXid = InvalidTransactionId;
				DtmLastSnapshot = NULL;
			}
			MMIsDistributedTrans = false;
			break;
		default:
			break;
	}
}
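/*
 * How a callback like DtmXactCallback is typically installed: via
 * RegisterXactCallback() from the extension's _PG_init().  Placing the
 * registration in _PG_init() is an assumption for this sketch;
 * RegisterXactCallback() itself is the standard PostgreSQL API declared
 * in access/xact.h.
 */
void
_PG_init(void)
{
	/* Invoke DtmXactCallback on every transaction start/commit/abort. */
	RegisterXactCallback(DtmXactCallback, NULL);
}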
/*
 * ShmemInitStruct -- Create/attach to a structure in shared memory.
 *
 *		This is called during initialization to find or allocate
 *		a data structure in shared memory.  If no other process
 *		has created the structure, this routine allocates space
 *		for it.  If it exists already, a pointer to the existing
 *		structure is returned.
 *
 *	Returns: pointer to the object.  *foundPtr is set TRUE if the object was
 *		already in the shmem index (hence, already initialized).
 *
 *	Note: before Postgres 9.0, this function returned NULL for some failure
 *	cases.  Now, it always throws error instead, so callers need not check
 *	for NULL.
 */
void *
ShmemInitStruct(const char *name, Size size, bool *foundPtr)
{
	ShmemIndexEnt *result;
	void	   *structPtr;

	LWLockAcquire(ShmemIndexLock, LW_EXCLUSIVE);

	if (!ShmemIndex)
	{
		PGShmemHeader *shmemseghdr = ShmemSegHdr;

		/* Must be trying to create/attach to ShmemIndex itself */
		Assert(strcmp(name, "ShmemIndex") == 0);

		if (IsUnderPostmaster)
		{
			/* Must be initializing a (non-standalone) backend */
			Assert(shmemseghdr->index != NULL);
			structPtr = shmemseghdr->index;
			*foundPtr = TRUE;
		}
		else
		{
			/*
			 * If the shmem index doesn't exist, we are bootstrapping: we
			 * must be trying to init the shmem index itself.
			 *
			 * Notice that the ShmemIndexLock is released before the shmem
			 * index has been initialized.  This should be OK because no
			 * other process can be accessing shared memory yet.
			 */
			Assert(shmemseghdr->index == NULL);
			structPtr = ShmemAlloc(size);
			if (structPtr == NULL)
				ereport(ERROR,
						(errcode(ERRCODE_OUT_OF_MEMORY),
						 errmsg("not enough shared memory for data structure"
								" \"%s\" (%zu bytes requested)",
								name, size)));
			shmemseghdr->index = structPtr;
			*foundPtr = FALSE;
		}
		LWLockRelease(ShmemIndexLock);
		return structPtr;
	}

	/* look it up in the shmem index */
	result = (ShmemIndexEnt *)
		hash_search(ShmemIndex, name, HASH_ENTER_NULL, foundPtr);

	if (!result)
	{
		LWLockRelease(ShmemIndexLock);
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("could not create ShmemIndex entry for data structure \"%s\"",
						name)));
	}

	if (*foundPtr)
	{
		/*
		 * Structure is in the shmem index so someone else has allocated it
		 * already.  The size better be the same as the size we are trying
		 * to initialize to, or there is a name conflict (or worse).
		 */
		if (result->size != size)
		{
			LWLockRelease(ShmemIndexLock);
			ereport(ERROR,
					(errmsg("ShmemIndex entry size is wrong for data structure"
							" \"%s\": expected %zu, actual %zu",
							name, size, result->size)));
		}
		structPtr = result->location;
	}
	else
	{
		/* It isn't in the table yet. allocate and initialize it */
		structPtr = ShmemAlloc(size);
		if (structPtr == NULL)
		{
			/* out of memory; remove the failed ShmemIndex entry */
			hash_search(ShmemIndex, name, HASH_REMOVE, NULL);
			LWLockRelease(ShmemIndexLock);
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("not enough shared memory for data structure"
							" \"%s\" (%zu bytes requested)",
							name, size)));
		}
		result->size = size;
		result->location = structPtr;
	}

	LWLockRelease(ShmemIndexLock);

	Assert(ShmemAddrIsValid(structPtr));
	return structPtr;
}
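/*
 * Canonical ShmemInitStruct() usage from a shmem-startup hook.  The struct
 * name "MyExtState", the entry name string, and the hook function below
 * are illustrative; the found/!found dance is the standard pattern, since
 * only the first process to attach performs initialization.
 */
typedef struct MyExtState
{
	int			counter;
} MyExtState;

static MyExtState *my_ext_state;

static void
my_ext_shmem_startup(void)
{
	bool		found;

	my_ext_state = (MyExtState *)
		ShmemInitStruct("my_ext state", sizeof(MyExtState), &found);
	if (!found)
		my_ext_state->counter = 0;	/* first attach: initialize contents */
}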
/*
 * CheckDeadLock
 *
 * We only get to this routine if the DEADLOCK_TIMEOUT fired
 * while waiting for a lock to be released by some other process.  Look
 * to see if there's a deadlock; if not, just return and continue waiting.
 * (But signal ProcSleep to log a message, if log_lock_waits is true.)
 * If we have a real deadlock, remove ourselves from the lock's wait queue
 * and signal an error to ProcSleep.
 *
 * NB: this is run inside a signal handler, so be very wary about what is
 * done here or in called routines.
 */
void
CheckDeadLock(void)
{
	int			i;

	/*
	 * Acquire exclusive lock on the entire shared lock data structures.
	 * Must grab LWLocks in partition-number order to avoid LWLock deadlock.
	 *
	 * Note that the deadlock check interrupt had better not be enabled
	 * anywhere that this process itself holds lock partition locks, else
	 * this will wait forever.  Also note that LWLockAcquire creates a
	 * critical section, so that this routine cannot be interrupted by
	 * cancel/die interrupts.
	 */
	for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
		LWLockAcquire(FirstLockMgrLock + i, LW_EXCLUSIVE);

	/*
	 * Check to see if we've been awoken by anyone in the interim.
	 *
	 * If we have, we can return and resume our transaction -- happy day.
	 * Before we are awoken the process releasing the lock grants it to us
	 * so we know that we don't have to wait anymore.
	 *
	 * We check by looking to see if we've been unlinked from the wait
	 * queue.  This is quicker than checking our semaphore's state, since no
	 * kernel call is needed, and it is safe because we hold the lock
	 * partition lock.
	 */
	if (MyProc->links.prev == NULL ||
		MyProc->links.next == NULL)
		goto check_done;

#ifdef LOCK_DEBUG
	if (Debug_deadlocks)
		DumpAllLocks();
#endif

	/* Run the deadlock check, and set deadlock_state for use by ProcSleep */
	deadlock_state = DeadLockCheck(MyProc);

	if (deadlock_state == DS_HARD_DEADLOCK)
	{
		/*
		 * Oops.  We have a deadlock.
		 *
		 * Get this process out of wait state. (Note: we could do this more
		 * efficiently by relying on lockAwaited, but use this coding to
		 * preserve the flexibility to kill some other transaction than the
		 * one detecting the deadlock.)
		 *
		 * RemoveFromWaitQueue sets MyProc->waitStatus to STATUS_ERROR, so
		 * ProcSleep will report an error after we return from the signal
		 * handler.
		 */
		Assert(MyProc->waitLock != NULL);
		RemoveFromWaitQueue(MyProc, LockTagHashCode(&(MyProc->waitLock->tag)));

		/*
		 * Unlock my semaphore so that the interrupted ProcSleep() call can
		 * finish.
		 */
		PGSemaphoreUnlock(&MyProc->sem);

		/*
		 * We're done here.  Transaction abort caused by the error that
		 * ProcSleep will raise will cause any other locks we hold to be
		 * released, thus allowing other processes to wake up; we don't need
		 * to do that here.  NOTE: an exception is that releasing locks we
		 * hold doesn't consider the possibility of waiters that were
		 * blocked behind us on the lock we just failed to get, and might
		 * now be wakable because we're not in front of them anymore.
		 * However, RemoveFromWaitQueue took care of waking up any such
		 * processes.
		 */
	}
	else if (log_lock_waits || deadlock_state == DS_BLOCKED_BY_AUTOVACUUM)
	{
		/*
		 * Unlock my semaphore so that the interrupted ProcSleep() call can
		 * print the log message (we daren't do it here because we are
		 * inside a signal handler).  It will then sleep again until someone
		 * releases the lock.
		 *
		 * If blocked by autovacuum, this wakeup will enable ProcSleep to
		 * send the canceling signal to the autovacuum worker.
		 */
		PGSemaphoreUnlock(&MyProc->sem);
	}

	/*
	 * And release locks.  We do this in reverse order for two reasons: (1)
	 * Anyone else who needs more than one of the locks will be trying to
	 * lock them in increasing order; we don't want to release the other
	 * process until it can get all the locks it needs.  (2) This avoids
	 * O(N^2) behavior inside LWLockRelease.
	 */
check_done:
	for (i = NUM_LOCK_PARTITIONS; --i >= 0;)
		LWLockRelease(FirstLockMgrLock + i);
}
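/*
 * The ordering discipline used above, in isolation: acquire all partition
 * locks in ascending index order (so concurrent acquirers cannot deadlock
 * against each other) and release them in descending order, for the
 * reasons given in the comment in CheckDeadLock.  The wrapper below is a
 * sketch for illustration, not code that exists in the tree.
 */
static void
with_all_lock_partitions(void (*callback) (void))
{
	int			i;

	for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
		LWLockAcquire(FirstLockMgrLock + i, LW_EXCLUSIVE);

	callback();

	for (i = NUM_LOCK_PARTITIONS; --i >= 0;)
		LWLockRelease(FirstLockMgrLock + i);
}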
/*
 * Write a page from a shared buffer, if necessary.
 * Does nothing if the specified slot is not dirty.
 *
 * NOTE: only one write attempt is made here.  Hence, it is possible that
 * the page is still dirty at exit (if someone else re-dirtied it during
 * the write).  However, we *do* attempt a fresh write even if the page
 * is already being written; this is for checkpoints.
 *
 * Control lock must be held at entry, and will be held at exit.
 */
void
SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
{
	SlruShared	shared = ctl->shared;
	int			pageno = shared->page_number[slotno];
	bool		ok;

	/* If a write is in progress, wait for it to finish */
	while (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
		   shared->page_number[slotno] == pageno)
	{
		SimpleLruWaitIO(ctl, slotno);
	}

	/*
	 * Do nothing if page is not dirty, or if buffer no longer contains the
	 * same page we were called for.
	 */
	if (!shared->page_dirty[slotno] ||
		shared->page_status[slotno] != SLRU_PAGE_VALID ||
		shared->page_number[slotno] != pageno)
		return;

	/*
	 * Mark the slot write-busy, and clear the dirtybit.  After this point,
	 * a transaction status update on this page will mark it dirty again.
	 */
	shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS;
	shared->page_dirty[slotno] = false;

	/* Acquire per-buffer lock (cannot deadlock, see notes at top) */
	LWLockAcquire(shared->buffer_locks[slotno], LW_EXCLUSIVE);

	/* Release control lock while doing I/O */
	LWLockRelease(shared->ControlLock);

	/* Do the write */
	ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata);

	/* If we failed, and we're in a flush, better close the files */
	if (!ok && fdata)
	{
		int			i;

		for (i = 0; i < fdata->num_files; i++)
			MirroredFlatFile_Close(&fdata->mirroredOpens[i]);
	}

	/* Re-acquire control lock and update page state */
	LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);

	Assert(shared->page_number[slotno] == pageno &&
		   shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS);

	/* If we failed to write, mark the page dirty again */
	if (!ok)
		shared->page_dirty[slotno] = true;

	shared->page_status[slotno] = SLRU_PAGE_VALID;

	LWLockRelease(shared->buffer_locks[slotno]);

	/* Now it's okay to ereport if we failed */
	if (!ok)
		SlruReportIOError(ctl, pageno, InvalidTransactionId);
}
/*
 * Drop a table space
 *
 * Be careful to check that the tablespace is empty.
 */
void
RemoveTableSpace(List *names, DropBehavior behavior, bool missing_ok)
{
	char	   *tablespacename;
	Relation	rel;
	HeapTuple	tuple;
	cqContext	cqc;
	cqContext  *pcqCtx;
	Oid			tablespaceoid;
	int32		count;
	RelFileNode relfilenode;
	DbDirNode	dbDirNode;
	PersistentFileSysState persistentState;
	ItemPointerData persistentTid;
	int64		persistentSerialNum;

	/* don't call this in a transaction block */
	// PreventTransactionChain((void *) stmt, "DROP TABLESPACE");

	/*
	 * General DROP (object) syntax allows fully qualified names, but
	 * tablespaces are global objects that do not live in schemas, so it is
	 * a syntax error if a fully qualified name was given.
	 */
	if (list_length(names) != 1)
		ereport(ERROR,
				(errcode(ERRCODE_SYNTAX_ERROR),
				 errmsg("tablespace name may not be qualified")));
	tablespacename = strVal(linitial(names));

	/* Disallow CASCADE */
	if (behavior == DROP_CASCADE)
		ereport(ERROR,
				(errcode(ERRCODE_SYNTAX_ERROR),
				 errmsg("syntax at or near \"cascade\"")));

	/*
	 * Find the target tuple
	 */
	rel = heap_open(TableSpaceRelationId, RowExclusiveLock);

	pcqCtx = caql_addrel(cqclr(&cqc), rel);

	tuple = caql_getfirst(
			pcqCtx,
			cql("SELECT * FROM pg_tablespace "
				" WHERE spcname = :1 "
				" FOR UPDATE ",
				CStringGetDatum(tablespacename)));

	if (!HeapTupleIsValid(tuple))
	{
		/* No such tablespace, no need to hold the lock */
		heap_close(rel, RowExclusiveLock);

		if (!missing_ok)
		{
			ereport(ERROR,
					(errcode(ERRCODE_UNDEFINED_OBJECT),
					 errmsg("tablespace \"%s\" does not exist",
							tablespacename)));
		}
		else
		{
			ereport(NOTICE,
					(errmsg("tablespace \"%s\" does not exist, skipping",
							tablespacename)));
		}
		return;
	}

	tablespaceoid = HeapTupleGetOid(tuple);

	/* Must be tablespace owner */
	if (!pg_tablespace_ownercheck(tablespaceoid, GetUserId()))
		aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_TABLESPACE,
					   tablespacename);

	/* Disallow drop of the standard tablespaces, even by superuser */
	if (tablespaceoid == GLOBALTABLESPACE_OID ||
		tablespaceoid == DEFAULTTABLESPACE_OID)
		aclcheck_error(ACLCHECK_NO_PRIV, ACL_KIND_TABLESPACE,
					   tablespacename);

	/*
	 * Check for any databases or relations defined in this tablespace.
	 * This is logically the same as checkSharedDependencies; however, we
	 * don't actually track these in pg_shdepend, instead we look this
	 * information up in the gp_persistent_database/relation_node tables.
	 */
	/* ... */

	/*
	 * Remove the pg_tablespace tuple (this will roll back if we fail below)
	 */
	caql_delete_current(pcqCtx);

	/*
	 * Remove any comments on this tablespace.
	 */
	DeleteSharedComments(tablespaceoid, TableSpaceRelationId);

	/*
	 * Remove dependency on owner.
	 *
	 * If shared dependencies are added between filespace <=> tablespace
	 * they will be deleted as well.
	 */
	deleteSharedDependencyRecordsFor(TableSpaceRelationId, tablespaceoid);

	/* MPP-6929: metadata tracking */
	if (Gp_role == GP_ROLE_DISPATCH)
		MetaTrackDropObject(TableSpaceRelationId, tablespaceoid);

	/*
	 * Acquire TablespaceCreateLock to ensure that no
	 * MirroredFileSysObj_JustInTimeDbDirCreate is running concurrently.
	 */
	LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE);

	/*
	 * Check for any relations still defined in the tablespace.
	 */
	PersistentRelation_CheckTablespace(tablespaceoid, &count, &relfilenode);
	if (count > 0)
	{
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("tablespace \"%s\" is not empty",
						tablespacename)));
	}

	/*
	 * Schedule the removal of the physical infrastructure.
	 *
	 * Note: This only schedules the delete; the delete won't actually occur
	 * until after the transaction has committed.  This should however do
	 * everything it can to assure that the delete will occur successfully,
	 * e.g. check permissions etc.
	 */

	/*
	 * Schedule all persistent database directory removals for transaction
	 * commit.
	 */
	PersistentDatabase_DirIterateInit();
	while (PersistentDatabase_DirIterateNext(
				&dbDirNode,
				&persistentState,
				&persistentTid,
				&persistentSerialNum))
	{
		if (dbDirNode.tablespace != tablespaceoid)
			continue;

		/*
		 * Database directory objects can linger in 'Drop Pending' state,
		 * etc, when the mirror is down and needs drop work.  So only pay
		 * attention to 'Created' objects.
		 */
		if (persistentState != PersistentFileSysState_Created)
			continue;

		MirroredFileSysObj_ScheduleDropDbDir(
				&dbDirNode,
				&persistentTid,
				persistentSerialNum);
	}

	/*
	 * Now schedule the tablespace directory removal.
	 */
	MirroredFileSysObj_ScheduleDropTablespaceDir(tablespaceoid);

	/*
	 * Note: because we checked that the tablespace was empty, there should
	 * be no need to worry about flushing shared buffers or free space map
	 * entries for relations in the tablespace.
	 *
	 * CHECK THIS, also check if the lock makes any sense in this context.
	 */

	/*
	 * Allow MirroredFileSysObj_JustInTimeDbDirCreate again.
	 */
	LWLockRelease(TablespaceCreateLock);

	/* We keep the lock on the row in pg_tablespace until commit */
	heap_close(rel, NoLock);

	/* Note: no need for dispatch, that is handled in utility.c */
	return;
}
/*
 * Remove all segments before the one holding the passed page number
 */
static void
SimpleLruTruncate_internal(SlruCtl ctl, int cutoffPage, bool lockHeld)
{
	SlruShared	shared = ctl->shared;
	int			slotno;

	/*
	 * The cutoff point is the start of the segment containing cutoffPage.
	 */
	cutoffPage -= cutoffPage % SLRU_PAGES_PER_SEGMENT;

	/*
	 * Scan shared memory and remove any pages preceding the cutoff page, to
	 * ensure we won't rewrite them later.  (Since this is normally called
	 * in or just after a checkpoint, any dirty pages should have been
	 * flushed already ... we're just being extra careful here.)
	 */
	if (!lockHeld)
		LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);

restart:;

	/*
	 * While we are holding the lock, make an important safety check: the
	 * planned cutoff point must be <= the current endpoint page.  Otherwise
	 * we have already wrapped around, and proceeding with the truncation
	 * would risk removing the current segment.
	 */
	if (ctl->PagePrecedes(shared->latest_page_number, cutoffPage))
	{
		if (!lockHeld)
			LWLockRelease(shared->ControlLock);
		ereport(LOG,
				(errmsg("could not truncate directory \"%s\": apparent wraparound",
						ctl->Dir)));
		return;
	}

	for (slotno = 0; slotno < shared->num_slots; slotno++)
	{
		if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
			continue;
		if (!ctl->PagePrecedes(shared->page_number[slotno], cutoffPage))
			continue;

		/*
		 * If page is clean, just change state to EMPTY (expected case).
		 */
		if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
			!shared->page_dirty[slotno])
		{
			shared->page_status[slotno] = SLRU_PAGE_EMPTY;
			continue;
		}

		/*
		 * Hmm, we have (or may have) I/O operations acting on the page, so
		 * we've got to wait for them to finish and then start again.  This
		 * is the same logic as in SlruSelectLRUPage.  (XXX if page is
		 * dirty, wouldn't it be OK to just discard it without writing it?
		 * For now, keep the logic the same as it was.)
		 */
		if (shared->page_status[slotno] == SLRU_PAGE_VALID)
			SimpleLruWritePage(ctl, slotno, NULL);
		else
			SimpleLruWaitIO(ctl, slotno);
		goto restart;
	}

	if (!lockHeld)
		LWLockRelease(shared->ControlLock);

	/* Now we can remove the old segment(s) */
	(void) SlruScanDirectory(ctl, cutoffPage, true);
}
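/*
 * Worked example for the cutoff rounding in SimpleLruTruncate_internal:
 * with SLRU_PAGES_PER_SEGMENT = 32 (the upstream value), a cutoffPage of
 * 70 rounds down to 64 (70 - 70 % 32), so the segment holding pages 64-95
 * is kept and only segments lying entirely before page 64 become
 * removable.
 */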
/*
 * Find a page in a shared buffer, reading it in if necessary.
 * The page number must correspond to an already-initialized page.
 *
 * The passed-in xid is used only for error reporting, and may be
 * InvalidTransactionId if no specific xid is associated with the action.
 *
 * If the passed-in pointer "valid" is NULL, I/O errors are reported by
 * this function.  If "valid" is not NULL, the function does not report
 * I/O errors; instead it sets the boolean pointed to by "valid" to true
 * if it was able to read the page, or false if the page read failed.
 *
 * Return value is the shared-buffer slot number now holding the page.
 * The buffer's LRU access info is updated.
 *
 * Control lock must be held at entry, and will be held at exit.
 */
static int
SimpleLruReadPage_Internal(SlruCtl ctl, int pageno, TransactionId xid, bool *valid)
{
	SlruShared	shared = ctl->shared;

	/* Outer loop handles restart if we must wait for someone else's I/O */
	for (;;)
	{
		int			slotno;
		bool		ok;

		/* See if page already is in memory; if not, pick victim slot */
		slotno = SlruSelectLRUPage(ctl, pageno);

		/* Did we find the page in memory? */
		if (shared->page_number[slotno] == pageno &&
			shared->page_status[slotno] != SLRU_PAGE_EMPTY)
		{
			/* If page is still being read in, we must wait for I/O */
			if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS)
			{
				SimpleLruWaitIO(ctl, slotno);
				/* Now we must recheck state from the top */
				continue;
			}
			/* Otherwise, it's ready to use */
			SlruRecentlyUsed(shared, slotno);
			if (valid != NULL)
				*valid = true;
			return slotno;
		}

		/* We found no match; assert we selected a freeable slot */
		Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
			   (shared->page_status[slotno] == SLRU_PAGE_VALID &&
				!shared->page_dirty[slotno]));

		/* Mark the slot read-busy */
		shared->page_number[slotno] = pageno;
		shared->page_status[slotno] = SLRU_PAGE_READ_IN_PROGRESS;
		shared->page_dirty[slotno] = false;

		/* Acquire per-buffer lock (cannot deadlock, see notes at top) */
		LWLockAcquire(shared->buffer_locks[slotno], LW_EXCLUSIVE);

		/*
		 * Temporarily mark page as recently-used to discourage
		 * SlruSelectLRUPage from selecting it again for someone else.
		 */
		SlruRecentlyUsed(shared, slotno);

		/* Release control lock while doing I/O */
		LWLockRelease(shared->ControlLock);

		/* Do the read */
		ok = SlruPhysicalReadPage(ctl, pageno, slotno);

		/* Re-acquire control lock and update page state */
		LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);

		Assert(shared->page_number[slotno] == pageno &&
			   shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS &&
			   !shared->page_dirty[slotno]);

		shared->page_status[slotno] = ok ? SLRU_PAGE_VALID : SLRU_PAGE_EMPTY;

		LWLockRelease(shared->buffer_locks[slotno]);

		/* Now it's okay to ereport if we failed */
		if (!ok && valid == NULL)
			SlruReportIOError(ctl, pageno, xid);
		else if (valid != NULL)
		{
			if (!ok)
			{
				LWLockRelease(shared->ControlLock);
				*valid = false;
				return -1;
			}
			else
				*valid = true;
		}

		SlruRecentlyUsed(shared, slotno);
		return slotno;
	}
}
/*
 * Select the slot to re-use when we need a free slot.
 *
 * The target page number is passed because we need to consider the
 * possibility that some other process reads in the target page while
 * we are doing I/O to free a slot.  Hence, check or recheck to see if
 * any slot already holds the target page, and return that slot if so.
 * Thus, the returned slot is *either* a slot already holding the pageno
 * (could be any state except EMPTY), *or* a freeable slot (state EMPTY
 * or CLEAN).
 *
 * Control lock must be held at entry, and will be held at exit.
 */
static int
SlruSelectLRUPage(SlruCtl ctl, int pageno)
{
	SlruShared	shared = ctl->shared;

	/* Outer loop handles restart after I/O */
	for (;;)
	{
		int			slotno;
		int			bestslot = 0;
		unsigned int bestcount = 0;

		/* See if page already has a buffer assigned */
		for (slotno = 0; slotno < NUM_SLRU_BUFFERS; slotno++)
		{
			if (shared->page_number[slotno] == pageno &&
				shared->page_status[slotno] != SLRU_PAGE_EMPTY)
				return slotno;
		}

		/*
		 * If we find any EMPTY slot, just select that one.  Else locate the
		 * least-recently-used slot that isn't the latest page.
		 */
		for (slotno = 0; slotno < NUM_SLRU_BUFFERS; slotno++)
		{
			if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
				return slotno;
			if (shared->page_lru_count[slotno] > bestcount &&
				shared->page_number[slotno] != shared->latest_page_number)
			{
				bestslot = slotno;
				bestcount = shared->page_lru_count[slotno];
			}
		}

		/*
		 * If the selected page is clean, we're set.
		 */
		if (shared->page_status[bestslot] == SLRU_PAGE_CLEAN)
			return bestslot;

		/*
		 * We need to do I/O.  Normal case is that we have to write it out,
		 * but it's possible in the worst case to have selected a read-busy
		 * page.  In that case we just wait for someone else to complete the
		 * I/O, which we can do by waiting for the per-buffer lock.
		 */
		if (shared->page_status[bestslot] == SLRU_PAGE_READ_IN_PROGRESS)
		{
			LWLockRelease(shared->ControlLock);
			LWLockAcquire(shared->buffer_locks[bestslot], LW_SHARED);
			LWLockRelease(shared->buffer_locks[bestslot]);
			LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
		}
		else
			SimpleLruWritePage(ctl, bestslot, NULL);

		/*
		 * Now loop back and try again.  This is the easiest way of dealing
		 * with corner cases such as the victim page being re-dirtied while
		 * we wrote it.
		 */
	}
}
/*
 * Write a page from a shared buffer, if necessary.
 * Does nothing if the specified slot is not dirty.
 *
 * NOTE: only one write attempt is made here.  Hence, it is possible that
 * the page is still dirty at exit (if someone else re-dirtied it during
 * the write).  However, we *do* attempt a fresh write even if the page
 * is already being written; this is for checkpoints.
 *
 * Control lock must be held at entry, and will be held at exit.
 */
void
SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
{
	SlruShared	shared = ctl->shared;
	int			pageno;
	bool		ok;

	/* Do nothing if page does not need writing */
	if (shared->page_status[slotno] != SLRU_PAGE_DIRTY &&
		shared->page_status[slotno] != SLRU_PAGE_WRITE_IN_PROGRESS)
		return;

	pageno = shared->page_number[slotno];

	/*
	 * We must grab the per-buffer lock to do I/O.  To avoid deadlock, must
	 * release ControlLock while waiting for per-buffer lock.  Fortunately,
	 * most of the time the per-buffer lock shouldn't be already held, so we
	 * can do this:
	 */
	if (!LWLockConditionalAcquire(shared->buffer_locks[slotno], LW_EXCLUSIVE))
	{
		LWLockRelease(shared->ControlLock);
		LWLockAcquire(shared->buffer_locks[slotno], LW_EXCLUSIVE);
		LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
	}

	/*
	 * Check to see if someone else already did the write, or took the
	 * buffer away from us.  If so, do nothing.  NOTE: we really should
	 * never see WRITE_IN_PROGRESS here, since that state should only occur
	 * while the writer is holding the buffer lock.  But accept it so that
	 * we have a recovery path if a writer aborts.
	 */
	if (shared->page_number[slotno] != pageno ||
		(shared->page_status[slotno] != SLRU_PAGE_DIRTY &&
		 shared->page_status[slotno] != SLRU_PAGE_WRITE_IN_PROGRESS))
	{
		LWLockRelease(shared->buffer_locks[slotno]);
		return;
	}

	/*
	 * Mark the slot write-busy.  After this point, a transaction status
	 * update on this page will mark it dirty again.
	 */
	shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS;

	/* Okay, release the control lock and do the write */
	LWLockRelease(shared->ControlLock);

	ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata);

	/* If we failed, and we're in a flush, better close the files */
	if (!ok && fdata)
	{
		int			i;

		for (i = 0; i < fdata->num_files; i++)
			close(fdata->fd[i]);
	}

	/* Re-acquire shared control lock and update page state */
	LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);

	Assert(shared->page_number[slotno] == pageno &&
		   (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS ||
			shared->page_status[slotno] == SLRU_PAGE_DIRTY));

	/* Cannot set CLEAN if someone re-dirtied page since write started */
	if (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS)
		shared->page_status[slotno] = ok ? SLRU_PAGE_CLEAN : SLRU_PAGE_DIRTY;

	LWLockRelease(shared->buffer_locks[slotno]);

	/* Now it's okay to ereport if we failed */
	if (!ok)
		SlruReportIOError(ctl, pageno, InvalidTransactionId);
}
/*
 * Find a page in a shared buffer, reading it in if necessary.
 * The page number must correspond to an already-initialized page.
 *
 * The passed-in xid is used only for error reporting, and may be
 * InvalidTransactionId if no specific xid is associated with the action.
 *
 * Return value is the shared-buffer slot number now holding the page.
 * The buffer's LRU access info is updated.
 *
 * Control lock must be held at entry, and will be held at exit.
 */
int
SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid)
{
	SlruShared	shared = ctl->shared;

	/* Outer loop handles restart if we lose the buffer to someone else */
	for (;;)
	{
		int			slotno;
		bool		ok;

		/* See if page already is in memory; if not, pick victim slot */
		slotno = SlruSelectLRUPage(ctl, pageno);

		/* Did we find the page in memory? */
		if (shared->page_number[slotno] == pageno &&
			shared->page_status[slotno] != SLRU_PAGE_EMPTY)
		{
			/* If page is still being read in, we cannot use it yet */
			if (shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS)
			{
				/* otherwise, it's ready to use */
				SlruRecentlyUsed(shared, slotno);
				return slotno;
			}
		}
		else
		{
			/* We found no match; assert we selected a freeable slot */
			Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
				   shared->page_status[slotno] == SLRU_PAGE_CLEAN);
		}

		/* Mark the slot read-busy (no-op if it already was) */
		shared->page_number[slotno] = pageno;
		shared->page_status[slotno] = SLRU_PAGE_READ_IN_PROGRESS;

		/*
		 * Temporarily mark page as recently-used to discourage
		 * SlruSelectLRUPage from selecting it again for someone else.
		 */
		SlruRecentlyUsed(shared, slotno);

		/*
		 * We must grab the per-buffer lock to do I/O.  To avoid deadlock,
		 * must release ControlLock while waiting for per-buffer lock.
		 * Fortunately, most of the time the per-buffer lock shouldn't be
		 * already held, so we can do this:
		 */
		if (!LWLockConditionalAcquire(shared->buffer_locks[slotno],
									  LW_EXCLUSIVE))
		{
			LWLockRelease(shared->ControlLock);
			LWLockAcquire(shared->buffer_locks[slotno], LW_EXCLUSIVE);
			LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
		}

		/*
		 * Check to see if someone else already did the read, or took the
		 * buffer away from us.  If so, restart from the top.
		 */
		if (shared->page_number[slotno] != pageno ||
			shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS)
		{
			LWLockRelease(shared->buffer_locks[slotno]);
			continue;
		}

		/* Okay, release control lock and do the read */
		LWLockRelease(shared->ControlLock);

		ok = SlruPhysicalReadPage(ctl, pageno, slotno);

		/* Re-acquire shared control lock and update page state */
		LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);

		Assert(shared->page_number[slotno] == pageno &&
			   shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS);

		shared->page_status[slotno] = ok ? SLRU_PAGE_CLEAN : SLRU_PAGE_EMPTY;

		LWLockRelease(shared->buffer_locks[slotno]);

		/* Now it's okay to ereport if we failed */
		if (!ok)
			SlruReportIOError(ctl, pageno, xid);

		SlruRecentlyUsed(shared, slotno);
		return slotno;
	}
}
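/*
 * Summary of the page-state transitions implemented by the two routines
 * above (this older SLRU variant, where clean/dirty is a status value
 * rather than a separate flag); derived from the code, stated here for
 * orientation only:
 *
 *	EMPTY --SimpleLruReadPage-->  READ_IN_PROGRESS  --ok-->   CLEAN
 *	                                                --fail--> EMPTY
 *	DIRTY --SimpleLruWritePage--> WRITE_IN_PROGRESS --ok-->   CLEAN
 *	                                                --fail--> DIRTY
 *
 * A CLEAN page becomes DIRTY again whenever a transaction status update
 * touches it, including while a write is in progress (in which case the
 * write leaves the page DIRTY rather than CLEAN).
 */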
/*
 * Each database using a table space is isolated into its own name space
 * by a subdirectory named for the database OID.  On first creation of an
 * object in the tablespace, create the subdirectory.  If the subdirectory
 * already exists, fall through quietly.
 *
 * isRedo indicates that we are creating an object during WAL replay.
 * In this case we will cope with the possibility of the tablespace
 * directory not being there either --- this could happen if we are
 * replaying an operation on a table in a subsequently-dropped tablespace.
 * We handle this by making a directory in the place where the tablespace
 * symlink would normally be.  This isn't an exact replay of course, but
 * it's the best we can do given the available information.
 *
 * If tablespaces are not supported, we still need it in case we have to
 * re-create a database subdirectory (of $PGDATA/base) during WAL replay.
 */
void
TablespaceCreateDbspace(Oid spcNode, Oid dbNode, bool isRedo)
{
	struct stat st;
	char	   *dir;

	/*
	 * The global tablespace doesn't have per-database subdirectories, so
	 * nothing to do for it.
	 */
	if (spcNode == GLOBALTABLESPACE_OID)
		return;

	Assert(OidIsValid(spcNode));
	Assert(OidIsValid(dbNode));

	dir = GetDatabasePath(dbNode, spcNode);

	if (stat(dir, &st) < 0)
	{
		/* Directory does not exist? */
		if (errno == ENOENT)
		{
			/*
			 * Acquire TablespaceCreateLock to ensure that no DROP TABLESPACE
			 * or TablespaceCreateDbspace is running concurrently.
			 */
			LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE);

			/*
			 * Recheck to see if someone created the directory while we were
			 * waiting for lock.
			 */
			if (stat(dir, &st) == 0 && S_ISDIR(st.st_mode))
			{
				/* Directory was created */
			}
			else
			{
				/* Directory creation failed? */
				if (mkdir(dir, S_IRWXU) < 0)
				{
					char	   *parentdir;

					/* Failure other than not exists or not in WAL replay? */
					if (errno != ENOENT || !isRedo)
						ereport(ERROR,
								(errcode_for_file_access(),
								 errmsg("could not create directory \"%s\": %m",
										dir)));

					/*
					 * Parent directories are missing during WAL replay, so
					 * continue by creating simple parent directories rather
					 * than a symlink.
					 */

					/* create two parents up if not exist */
					parentdir = pstrdup(dir);
					get_parent_directory(parentdir);
					get_parent_directory(parentdir);
					/* Can't create parent and it doesn't already exist? */
					if (mkdir(parentdir, S_IRWXU) < 0 && errno != EEXIST)
						ereport(ERROR,
								(errcode_for_file_access(),
								 errmsg("could not create directory \"%s\": %m",
										parentdir)));
					pfree(parentdir);

					/* create one parent up if not exist */
					parentdir = pstrdup(dir);
					get_parent_directory(parentdir);
					/* Can't create parent and it doesn't already exist? */
					if (mkdir(parentdir, S_IRWXU) < 0 && errno != EEXIST)
						ereport(ERROR,
								(errcode_for_file_access(),
								 errmsg("could not create directory \"%s\": %m",
										parentdir)));
					pfree(parentdir);

					/* Create database directory */
					if (mkdir(dir, S_IRWXU) < 0)
						ereport(ERROR,
								(errcode_for_file_access(),
								 errmsg("could not create directory \"%s\": %m",
										dir)));
				}
			}

			LWLockRelease(TablespaceCreateLock);
		}
		else
		{
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not stat directory \"%s\": %m", dir)));
		}
	}
	else
	{
		/* Is it not a directory? */
		if (!S_ISDIR(st.st_mode))
			ereport(ERROR,
					(errcode(ERRCODE_WRONG_OBJECT_TYPE),
					 errmsg("\"%s\" exists but is not a directory",
							dir)));
	}

	pfree(dir);
}
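/*
 * The redo-time recovery above is essentially a bounded "mkdir -p"
 * (create up to two missing parents, then the directory itself).  For
 * contrast, a generic POSIX version of the same idea is sketched below;
 * the function name is invented, and the path is modified in place while
 * walking its components.
 */
#include <errno.h>
#include <string.h>
#include <sys/stat.h>

static int
mkdir_parents(char *path)
{
	char	   *sep;

	for (sep = strchr(path + 1, '/'); sep != NULL; sep = strchr(sep + 1, '/'))
	{
		*sep = '\0';			/* temporarily terminate at this level */
		if (mkdir(path, S_IRWXU) < 0 && errno != EEXIST)
		{
			*sep = '/';
			return -1;
		}
		*sep = '/';				/* restore the separator */
	}
	/* finally create the last component itself */
	if (mkdir(path, S_IRWXU) < 0 && errno != EEXIST)
		return -1;
	return 0;
}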
/*
 * Update the LSNs on each queue based upon our latest state.  This
 * implements a simple policy of first-valid-standby-releases-waiter.
 *
 * Other policies are possible, which would change what we do here and
 * perhaps also which information we store.
 */
void
SyncRepReleaseWaiters(void)
{
	volatile WalSndCtlData *walsndctl = WalSndCtl;
	volatile WalSnd *syncWalSnd = NULL;
	int			numprocs = 0;
	int			priority = 0;
	int			i;

	/*
	 * If this WALSender is serving a standby that is not on the list of
	 * potential standbys then we have nothing to do.  If we are still
	 * starting up or still running base backup, then leave quickly also.
	 */
	if (MyWalSnd->sync_standby_priority == 0 ||
		MyWalSnd->state < WALSNDSTATE_STREAMING)
		return;

	/*
	 * We're a potential sync standby.  Release waiters if we are the
	 * highest priority standby.  If there are multiple standbys with the
	 * same priority, we use the first mentioned standby.  If you change
	 * this, also change pg_stat_get_wal_senders().
	 */
	LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);

	for (i = 0; i < max_wal_senders; i++)
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile WalSnd *walsnd = &walsndctl->walsnds[i];

		if (walsnd->pid != 0 &&
			walsnd->sync_standby_priority > 0 &&
			(priority == 0 ||
			 priority > walsnd->sync_standby_priority))
		{
			priority = walsnd->sync_standby_priority;
			syncWalSnd = walsnd;
		}
	}

	/*
	 * We should have found ourselves at least.
	 */
	Assert(syncWalSnd);

	/*
	 * If we aren't managing the highest priority standby then just leave.
	 */
	if (syncWalSnd != MyWalSnd)
	{
		LWLockRelease(SyncRepLock);
		announce_next_takeover = true;
		return;
	}

	if (XLByteLT(walsndctl->lsn, MyWalSnd->flush))
	{
		/*
		 * Set the lsn first so that when we wake backends they will release
		 * up to this location.
		 */
		walsndctl->lsn = MyWalSnd->flush;
		numprocs = SyncRepWakeQueue(false);
	}

	LWLockRelease(SyncRepLock);

	elog(DEBUG3, "released %d procs up to %X/%X",
		 numprocs,
		 MyWalSnd->flush.xlogid,
		 MyWalSnd->flush.xrecoff);

	/*
	 * If we are managing the highest priority standby, though we weren't
	 * prior to this, then announce we are now the sync standby.
	 */
	if (announce_next_takeover)
	{
		announce_next_takeover = false;
		ereport(LOG,
				(errmsg("standby \"%s\" is now the synchronous standby with priority %u",
						application_name, MyWalSnd->sync_standby_priority)));
	}
}
/*
 * Drop a table space
 *
 * Be careful to check that the tablespace is empty.
 */
void
DropTableSpace(DropTableSpaceStmt *stmt)
{
#ifdef HAVE_SYMLINK
	char	   *tablespacename = stmt->tablespacename;
	HeapScanDesc scandesc;
	Relation	rel;
	HeapTuple	tuple;
	ScanKeyData entry[1];
	Oid			tablespaceoid;

	/*
	 * Find the target tuple
	 */
	rel = heap_open(TableSpaceRelationId, RowExclusiveLock);

	ScanKeyInit(&entry[0],
				Anum_pg_tablespace_spcname,
				BTEqualStrategyNumber, F_NAMEEQ,
				CStringGetDatum(tablespacename));
	scandesc = heap_beginscan(rel, SnapshotNow, 1, entry);
	tuple = heap_getnext(scandesc, ForwardScanDirection);

	if (!HeapTupleIsValid(tuple))
	{
		if (!stmt->missing_ok)
		{
			ereport(ERROR,
					(errcode(ERRCODE_UNDEFINED_OBJECT),
					 errmsg("tablespace \"%s\" does not exist",
							tablespacename)));
		}
		else
		{
			ereport(NOTICE,
					(errmsg("tablespace \"%s\" does not exist, skipping",
							tablespacename)));
			/* XXX I assume I need one or both of these next two calls */
			heap_endscan(scandesc);
			heap_close(rel, NoLock);
		}
		return;
	}

	tablespaceoid = HeapTupleGetOid(tuple);

	/* Must be tablespace owner */
	if (!pg_tablespace_ownercheck(tablespaceoid, GetUserId()))
		aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_TABLESPACE,
					   tablespacename);

	/* Disallow drop of the standard tablespaces, even by superuser */
	if (tablespaceoid == GLOBALTABLESPACE_OID ||
		tablespaceoid == DEFAULTTABLESPACE_OID)
		aclcheck_error(ACLCHECK_NO_PRIV, ACL_KIND_TABLESPACE,
					   tablespacename);

	/*
	 * Remove the pg_tablespace tuple (this will roll back if we fail below)
	 */
	simple_heap_delete(rel, &tuple->t_self);

	heap_endscan(scandesc);

	/*
	 * Remove any comments or security labels on this tablespace.
	 */
	DeleteSharedComments(tablespaceoid, TableSpaceRelationId);
	DeleteSharedSecurityLabel(tablespaceoid, TableSpaceRelationId);

	/*
	 * Remove dependency on owner.
	 */
	deleteSharedDependencyRecordsFor(TableSpaceRelationId, tablespaceoid, 0);

	/*
	 * Acquire TablespaceCreateLock to ensure that no TablespaceCreateDbspace
	 * is running concurrently.
	 */
	LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE);

	/*
	 * Try to remove the physical infrastructure.
	 */
	if (!destroy_tablespace_directories(tablespaceoid, false))
	{
		/*
		 * Not all files deleted?  However, there can be lingering empty
		 * files in the directories, left behind by for example DROP TABLE,
		 * that have been scheduled for deletion at next checkpoint (see
		 * comments in mdunlink() for details).  We could just delete them
		 * immediately, but we can't tell them apart from important data
		 * files that we mustn't delete.  So instead, we force a checkpoint
		 * which will clean out any lingering files, and try again.
		 */
		RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT);
		if (!destroy_tablespace_directories(tablespaceoid, false))
		{
			/* Still not empty, the files must be important then */
			ereport(ERROR,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("tablespace \"%s\" is not empty",
							tablespacename)));
		}
	}

	/* Record the filesystem change in XLOG */
	{
		xl_tblspc_drop_rec xlrec;
		XLogRecData rdata[1];

		xlrec.ts_id = tablespaceoid;
		rdata[0].data = (char *) &xlrec;
		rdata[0].len = sizeof(xl_tblspc_drop_rec);
		rdata[0].buffer = InvalidBuffer;
		rdata[0].next = NULL;

		(void) XLogInsert(RM_TBLSPC_ID, XLOG_TBLSPC_DROP, rdata);
	}

	/*
	 * Note: because we checked that the tablespace was empty, there should
	 * be no need to worry about flushing shared buffers or free space map
	 * entries for relations in the tablespace.
	 */

	/*
	 * Force synchronous commit, to minimize the window between removing the
	 * files on-disk and marking the transaction committed.  It's not great
	 * that there is any window at all, but definitely we don't want to make
	 * it larger than necessary.
	 */
	ForceSyncCommit();

	/*
	 * Allow TablespaceCreateDbspace again.
	 */
	LWLockRelease(TablespaceCreateLock);

	/* We keep the lock on pg_tablespace until commit */
	heap_close(rel, NoLock);
#else							/* !HAVE_SYMLINK */
	ereport(ERROR,
			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
			 errmsg("tablespaces are not supported on this platform")));
#endif   /* HAVE_SYMLINK */
}
/*
 * Wait for synchronous replication, if requested by user.
 *
 * Initially backends start in state SYNC_REP_NOT_WAITING and then
 * change that state to SYNC_REP_WAITING before adding ourselves
 * to the wait queue.  During SyncRepWakeQueue() a WALSender changes
 * the state to SYNC_REP_WAIT_COMPLETE once replication is confirmed.
 * This backend then resets its state to SYNC_REP_NOT_WAITING.
 */
void
SyncRepWaitForLSN(XLogRecPtr XactCommitLSN)
{
	char	   *new_status = NULL;
	const char *old_status;

	/*
	 * Fast exit if user has not requested sync replication, or there are no
	 * sync replication standby names defined.  Note that those standbys
	 * don't need to be connected.
	 */
	if (!SyncRepRequested() || !SyncStandbysDefined())
		return;

	Assert(SHMQueueIsDetached(&(MyProc->syncRepLinks)));
	Assert(WalSndCtl != NULL);

	LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
	Assert(MyProc->syncRepState == SYNC_REP_NOT_WAITING);

	/*
	 * We don't wait for sync rep if WalSndCtl->sync_standbys_defined is not
	 * set.  See SyncRepUpdateSyncStandbysDefined.
	 *
	 * Also check that the standby hasn't already replied.  Unlikely race
	 * condition but we'll be fetching that cache line anyway so it's likely
	 * to be a low cost check.
	 */
	if (!WalSndCtl->sync_standbys_defined ||
		XLByteLE(XactCommitLSN, WalSndCtl->lsn))
	{
		LWLockRelease(SyncRepLock);
		return;
	}

	/*
	 * Set our waitLSN so WALSender will know when to wake us, and add
	 * ourselves to the queue.
	 */
	MyProc->waitLSN = XactCommitLSN;
	MyProc->syncRepState = SYNC_REP_WAITING;
	SyncRepQueueInsert();
	Assert(SyncRepQueueIsOrderedByLSN());
	LWLockRelease(SyncRepLock);

	/* Alter ps display to show waiting for sync rep. */
	if (update_process_title)
	{
		int			len;

		old_status = get_ps_display(&len);
		new_status = (char *) palloc(len + 32 + 1);
		memcpy(new_status, old_status, len);
		sprintf(new_status + len, " waiting for %X/%X",
				XactCommitLSN.xlogid, XactCommitLSN.xrecoff);
		set_ps_display(new_status, false);
		new_status[len] = '\0'; /* truncate off " waiting ..." */
	}

	/*
	 * Wait for specified LSN to be confirmed.
	 *
	 * Each proc has its own wait latch, so we perform a normal latch
	 * check/wait loop here.
	 */
	for (;;)
	{
		int			syncRepState;

		/* Must reset the latch before testing state. */
		ResetLatch(&MyProc->procLatch);

		/*
		 * Try checking the state without the lock first.  There's no
		 * guarantee that we'll read the most up-to-date value, so if it
		 * looks like we're still waiting, recheck while holding the lock.
		 * But if it looks like we're done, we must really be done, because
		 * once walsender changes the state to SYNC_REP_WAIT_COMPLETE, it
		 * will never update it again, so we can't be seeing a stale value
		 * in that case.
		 *
		 * Note: on machines with weak memory ordering, the acquisition of
		 * the lock is essential to avoid race conditions: we cannot be sure
		 * the sender's state update has reached main memory until we
		 * acquire the lock.  We could get rid of this dance if
		 * SetLatch/ResetLatch contained memory barriers.
		 */
		syncRepState = MyProc->syncRepState;
		if (syncRepState == SYNC_REP_WAITING)
		{
			LWLockAcquire(SyncRepLock, LW_SHARED);
			syncRepState = MyProc->syncRepState;
			LWLockRelease(SyncRepLock);
		}
		if (syncRepState == SYNC_REP_WAIT_COMPLETE)
			break;

		/*
		 * If a wait for synchronous replication is pending, we can neither
		 * acknowledge the commit nor raise ERROR or FATAL.  The latter
		 * would lead the client to believe that the transaction aborted,
		 * which is not true: it's already committed locally.  The former is
		 * no good either: the client has requested synchronous replication,
		 * and is entitled to assume that an acknowledged commit is also
		 * replicated, which might not be true.  So in this case we issue a
		 * WARNING (which some clients may be able to interpret) and shut
		 * off further output.  We do NOT reset ProcDiePending, so that the
		 * process will die after the commit is cleaned up.
		 */
		if (ProcDiePending)
		{
			ereport(WARNING,
					(errcode(ERRCODE_ADMIN_SHUTDOWN),
					 errmsg("canceling the wait for synchronous replication and terminating connection due to administrator command"),
					 errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
			whereToSendOutput = DestNone;
			SyncRepCancelWait();
			break;
		}

		/*
		 * It's unclear what to do if a query cancel interrupt arrives.  We
		 * can't actually abort at this point, but ignoring the interrupt
		 * altogether is not helpful, so we just terminate the wait with a
		 * suitable warning.
		 */
		if (QueryCancelPending)
		{
			QueryCancelPending = false;
			ereport(WARNING,
					(errmsg("canceling wait for synchronous replication due to user request"),
					 errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
			SyncRepCancelWait();
			break;
		}

		/*
		 * If the postmaster dies, we'll probably never get an
		 * acknowledgement, because all the wal sender processes will exit.
		 * So just bail out.
		 */
		if (!PostmasterIsAlive())
		{
			ProcDiePending = true;
			whereToSendOutput = DestNone;
			SyncRepCancelWait();
			break;
		}

		/*
		 * Wait on latch.  Any condition that should wake us up will set the
		 * latch, so no need for timeout.
		 */
		WaitLatch(&MyProc->procLatch, WL_LATCH_SET | WL_POSTMASTER_DEATH, -1);
	}

	/*
	 * WalSender has checked our LSN and has removed us from queue.  Clean
	 * up state and leave.  It's OK to reset these shared memory fields
	 * without holding SyncRepLock, because any walsenders will ignore us
	 * anyway when we're not on the queue.
	 */
	Assert(SHMQueueIsDetached(&(MyProc->syncRepLinks)));
	MyProc->syncRepState = SYNC_REP_NOT_WAITING;
	MyProc->waitLSN.xlogid = 0;
	MyProc->waitLSN.xrecoff = 0;

	if (new_status)
	{
		/* Reset ps display */
		set_ps_display(new_status, false);
		pfree(new_status);
	}
}
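/*
 * The wait loop above follows the canonical latch protocol: reset the
 * latch *before* testing the condition, so a SetLatch() that fires
 * between the test and the WaitLatch() cannot be lost.  Reduced to its
 * skeleton (my_condition_met() is a placeholder for the state check):
 *
 *	for (;;)
 *	{
 *		ResetLatch(&MyProc->procLatch);
 *		if (my_condition_met())
 *			break;
 *		WaitLatch(&MyProc->procLatch, WL_LATCH_SET | WL_POSTMASTER_DEATH, -1);
 *	}
 */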
Datum pg_buffercache_pages(PG_FUNCTION_ARGS) { FuncCallContext *funcctx; Datum result; MemoryContext oldcontext; BufferCachePagesContext *fctx; /* User function context. */ TupleDesc tupledesc; HeapTuple tuple; if (SRF_IS_FIRSTCALL()) { int i; volatile BufferDesc *bufHdr; funcctx = SRF_FIRSTCALL_INIT(); /* Switch context when allocating stuff to be used in later calls */ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* Create a user function context for cross-call persistence */ fctx = (BufferCachePagesContext *) palloc(sizeof(BufferCachePagesContext)); /* Construct a tuple descriptor for the result rows. */ tupledesc = CreateTemplateTupleDesc(NUM_BUFFERCACHE_PAGES_ELEM, false); TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid", INT4OID, -1, 0); TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode", OIDOID, -1, 0); TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace", OIDOID, -1, 0); TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase", OIDOID, -1, 0); TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber", INT2OID, -1, 0); TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber", INT8OID, -1, 0); TupleDescInitEntry(tupledesc, (AttrNumber) 7, "isdirty", BOOLOID, -1, 0); TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count", INT2OID, -1, 0); fctx->tupdesc = BlessTupleDesc(tupledesc); /* Allocate NBuffers worth of BufferCachePagesRec records. */ fctx->record = (BufferCachePagesRec *) palloc(sizeof(BufferCachePagesRec) * NBuffers); /* Set max calls and remember the user function context. */ funcctx->max_calls = NBuffers; funcctx->user_fctx = fctx; /* Return to original context when allocating transient memory */ MemoryContextSwitchTo(oldcontext); /* * To get a consistent picture of the buffer state, we must lock all * partitions of the buffer map. Needless to say, this is horrible * for concurrency. Must grab locks in increasing order to avoid * possible deadlocks. */ for (i = 0; i < NUM_BUFFER_PARTITIONS; i++) LWLockAcquire(FirstBufMappingLock + i, LW_SHARED); /* * Scan though all the buffers, saving the relevant fields in the * fctx->record structure. */ for (i = 0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++) { /* Lock each buffer header before inspecting. */ LockBufHdr(bufHdr); fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr); fctx->record[i].relfilenode = bufHdr->tag.rnode.relNode; fctx->record[i].reltablespace = bufHdr->tag.rnode.spcNode; fctx->record[i].reldatabase = bufHdr->tag.rnode.dbNode; fctx->record[i].forknum = bufHdr->tag.forkNum; fctx->record[i].blocknum = bufHdr->tag.blockNum; fctx->record[i].usagecount = bufHdr->usage_count; if (bufHdr->flags & BM_DIRTY) fctx->record[i].isdirty = true; else fctx->record[i].isdirty = false; /* Note if the buffer is valid, and has storage created */ if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_TAG_VALID)) fctx->record[i].isvalid = true; else fctx->record[i].isvalid = false; UnlockBufHdr(bufHdr); } /* * And release locks. We do this in reverse order for two reasons: * (1) Anyone else who needs more than one of the locks will be trying * to lock them in increasing order; we don't want to release the * other process until it can get all the locks it needs. (2) This * avoids O(N^2) behavior inside LWLockRelease. 
         */
        for (i = NUM_BUFFER_PARTITIONS; --i >= 0;)
            LWLockRelease(FirstBufMappingLock + i);
    }

    funcctx = SRF_PERCALL_SETUP();

    /* Get the saved state */
    fctx = funcctx->user_fctx;

    if (funcctx->call_cntr < funcctx->max_calls)
    {
        uint32      i = funcctx->call_cntr;
        Datum       values[NUM_BUFFERCACHE_PAGES_ELEM];
        bool        nulls[NUM_BUFFERCACHE_PAGES_ELEM];

        values[0] = Int32GetDatum(fctx->record[i].bufferid);
        nulls[0] = false;

        /*
         * Set all fields except the bufferid to null if the buffer is unused
         * or not valid.
         */
        if (fctx->record[i].blocknum == InvalidBlockNumber ||
            fctx->record[i].isvalid == false)
        {
            nulls[1] = true;
            nulls[2] = true;
            nulls[3] = true;
            nulls[4] = true;
            nulls[5] = true;
            nulls[6] = true;
            nulls[7] = true;
        }
        else
        {
            values[1] = ObjectIdGetDatum(fctx->record[i].relfilenode);
            nulls[1] = false;
            values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
            nulls[2] = false;
            values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase);
            nulls[3] = false;
            values[4] = ObjectIdGetDatum(fctx->record[i].forknum);
            nulls[4] = false;
            values[5] = Int64GetDatum((int64) fctx->record[i].blocknum);
            nulls[5] = false;
            values[6] = BoolGetDatum(fctx->record[i].isdirty);
            nulls[6] = false;
            values[7] = Int16GetDatum(fctx->record[i].usagecount);
            nulls[7] = false;
        }

        /* Build and return the tuple. */
        tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
        result = HeapTupleGetDatum(tuple);

        SRF_RETURN_NEXT(funcctx, result);
    }
    else
        SRF_RETURN_DONE(funcctx);
}
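/*
 * Standalone sketch, not from the sources above: the partition-lock dance
 * in pg_buffercache_pages (acquire 0..N-1 in increasing order, release in
 * decreasing order) is the classic total-order rule for taking many locks
 * without deadlock.  A model with pthread mutexes; NUM_PARTS stands in for
 * NUM_BUFFER_PARTITIONS.
 */
#include <pthread.h>

#define NUM_PARTS 16

static pthread_mutex_t part_lock[NUM_PARTS];

static void
init_partitions(void)
{
    /* pthread mutexes in an array need explicit initialization */
    for (int i = 0; i < NUM_PARTS; i++)
        pthread_mutex_init(&part_lock[i], NULL);
}

static void
snapshot_all_partitions(void (*visit)(int part))
{
    int         i;

    /* Grab every partition lock in increasing order: no deadlock. */
    for (i = 0; i < NUM_PARTS; i++)
        pthread_mutex_lock(&part_lock[i]);

    for (i = 0; i < NUM_PARTS; i++)
        visit(i);               /* inspect state under the locks */

    /* Release in reverse order, mirroring the comment above. */
    for (i = NUM_PARTS; --i >= 0;)
        pthread_mutex_unlock(&part_lock[i]);
}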
/*
 * Releases the pinCount of a SessionState entry.  If the pinCount
 * drops to 0, it puts the entry back on the freeList for reuse.
 */
static void
SessionState_Release(SessionState *acquired)
{
    if (!sessionStateInited)
    {
        Assert(NULL == acquired);
        return;
    }

    Assert(NULL != acquired);
    Assert(0 < acquired->pinCount);
    Assert(acquired->sessionId == gp_session_id || acquired->isModifiedSessionId);

    LWLockAcquire(SessionStateLock, LW_EXCLUSIVE);

    Assert(!isProcessActive);
    Assert(acquired->activeProcessCount < acquired->pinCount);

    int         pinCount = pg_atomic_sub_fetch_u32((pg_atomic_uint32 *) &acquired->pinCount, 1);

    ereport(gp_sessionstate_loglevel,
            (errmsg("SessionState_Release: pinCount: %d, activeProcessCount: %d",
                    pinCount, acquired->activeProcessCount),
             errprintstack(true)));

    /* Before this point the process should have been deactivated */
    Assert(acquired->activeProcessCount <= acquired->pinCount);
    Assert(0 <= acquired->pinCount);

    if (0 == acquired->pinCount)
    {
        RunawayCleaner_RunawayCleanupDoneForSession();

        acquired->sessionId = INVALID_SESSION_ID;

        Assert(acquired->runawayStatus == RunawayStatus_NotRunaway);
        Assert(CLEANUP_COUNTDOWN_BEFORE_RUNAWAY == acquired->cleanupCountdown);
        Assert(0 == acquired->activeProcessCount);

        acquired->sessionVmem = 0;
        acquired->runawayStatus = RunawayStatus_NotRunaway;
        acquired->sessionVmemRunaway = 0;
        acquired->commandCountRunaway = 0;
        acquired->cleanupCountdown = CLEANUP_COUNTDOWN_BEFORE_RUNAWAY;
        acquired->activeProcessCount = 0;
        acquired->idle_start = 0;
        acquired->resGroupSlot = NULL;

#ifdef USE_ASSERT_CHECKING
        acquired->isModifiedSessionId = false;
#endif

        SessionState *cur = AllSessionStateEntries->usedList;
        SessionState *prev = NULL;

        while (cur != acquired && cur != NULL)
        {
            prev = cur;
            cur = cur->next;
        }

        Assert(cur == acquired);

        /* acquired is at the head of the used list */
        if (NULL == prev)
        {
            Assert(AllSessionStateEntries->usedList == acquired);
            AllSessionStateEntries->usedList = acquired->next;
        }
        else
        {
            prev->next = cur->next;
        }

        acquired->next = AllSessionStateEntries->freeList;
        AllSessionStateEntries->freeList = acquired;
        AllSessionStateEntries->numSession--;
        Assert(AllSessionStateEntries->numSession >= 0);
    }

    LWLockRelease(SessionStateLock);
}
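/*
 * Standalone sketch, not from the sources above: SessionState_Release walks
 * the singly linked usedList to find the node's predecessor, unlinks it,
 * and pushes it onto the freeList.  The same unlink-and-recycle step in
 * isolation, over a hypothetical node type.
 */
#include <assert.h>
#include <stddef.h>

typedef struct Node
{
    int         id;
    struct Node *next;
} Node;

static Node *used_list;
static Node *free_list;

static void
recycle(Node *victim)
{
    Node       *cur = used_list;
    Node       *prev = NULL;

    while (cur != victim && cur != NULL)
    {
        prev = cur;
        cur = cur->next;
    }
    assert(cur == victim);      /* victim must be on the used list */

    if (prev == NULL)
        used_list = victim->next;   /* victim was the list head */
    else
        prev->next = victim->next;

    victim->next = free_list;       /* push onto the free list */
    free_list = victim;
}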
/*
 * ProcSleep -- put a process to sleep on the specified lock
 *
 * Caller must have set MyProc->heldLocks to reflect locks already held
 * on the lockable object by this process (under all XIDs).
 *
 * The lock table's partition lock must be held at entry, and will be held
 * at exit.
 *
 * Result: STATUS_OK if we acquired the lock, STATUS_ERROR if not (deadlock).
 *
 * ASSUME: that no one will fiddle with the queue until after
 * we release the partition lock.
 *
 * NOTES: The process queue is now a priority queue for locking.
 *
 * P() on the semaphore should put us to sleep.  The process
 * semaphore is normally zero, so when we try to acquire it, we sleep.
 */
int
ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
{
    LOCKMODE    lockmode = locallock->tag.mode;
    LOCK       *lock = locallock->lock;
    PROCLOCK   *proclock = locallock->proclock;
    uint32      hashcode = locallock->hashcode;
    LWLockId    partitionLock = LockHashPartitionLock(hashcode);
    PROC_QUEUE *waitQueue = &(lock->waitProcs);
    LOCKMASK    myHeldLocks = MyProc->heldLocks;
    bool        early_deadlock = false;
    bool        allow_autovacuum_cancel = true;
    int         myWaitStatus;
    PGPROC     *proc;
    int         i;

    /*
     * Determine where to add myself in the wait queue.
     *
     * Normally I should go at the end of the queue.  However, if I already
     * hold locks that conflict with the request of any previous waiter, put
     * myself in the queue just in front of the first such waiter.  This is
     * not a necessary step, since deadlock detection would move me to before
     * that waiter anyway; but it's relatively cheap to detect such a conflict
     * immediately, and avoid delaying till deadlock timeout.
     *
     * Special case: if I find I should go in front of some waiter, check to
     * see if I conflict with already-held locks or the requests before that
     * waiter.  If not, then just grant myself the requested lock immediately.
     * This is the same as the test for immediate grant in LockAcquire, except
     * we are only considering the part of the wait queue before my insertion
     * point.
     */
    if (myHeldLocks != 0)
    {
        LOCKMASK    aheadRequests = 0;

        proc = (PGPROC *) waitQueue->links.next;
        for (i = 0; i < waitQueue->size; i++)
        {
            /* Must he wait for me? */
            if (lockMethodTable->conflictTab[proc->waitLockMode] & myHeldLocks)
            {
                /* Must I wait for him ? */
                if (lockMethodTable->conflictTab[lockmode] & proc->heldLocks)
                {
                    /*
                     * Yes, so we have a deadlock.  Easiest way to clean up
                     * correctly is to call RemoveFromWaitQueue(), but we
                     * can't do that until we are *on* the wait queue. So, set
                     * a flag to check below, and break out of loop.  Also,
                     * record deadlock info for later message.
                     */
                    RememberSimpleDeadLock(MyProc, lockmode, lock, proc);
                    early_deadlock = true;
                    break;
                }
                /* I must go before this waiter.  Check special case. */
                if ((lockMethodTable->conflictTab[lockmode] & aheadRequests) == 0 &&
                    LockCheckConflicts(lockMethodTable,
                                       lockmode,
                                       lock,
                                       proclock,
                                       MyProc) == STATUS_OK)
                {
                    /* Skip the wait and just grant myself the lock. */
                    GrantLock(lock, proclock, lockmode);
                    GrantAwaitedLock();
                    return STATUS_OK;
                }
                /* Break out of loop to put myself before him */
                break;
            }
            /* Nope, so advance to next waiter */
            aheadRequests |= LOCKBIT_ON(proc->waitLockMode);
            proc = (PGPROC *) proc->links.next;
        }

        /*
         * If we fall out of loop normally, proc points to waitQueue head, so
         * we will insert at tail of queue as desired.
         */
    }
    else
    {
        /* I hold no locks, so I can't push in front of anyone. */
        proc = (PGPROC *) &(waitQueue->links);
    }

    /*
     * Insert self into queue, ahead of the given proc (or at tail of queue).
     */
    SHMQueueInsertBefore(&(proc->links), &(MyProc->links));
    waitQueue->size++;

    lock->waitMask |= LOCKBIT_ON(lockmode);

    /* Set up wait information in PGPROC object, too */
    MyProc->waitLock = lock;
    MyProc->waitProcLock = proclock;
    MyProc->waitLockMode = lockmode;

    MyProc->waitStatus = STATUS_WAITING;

    /*
     * If we detected deadlock, give up without waiting.  This must agree with
     * CheckDeadLock's recovery code, except that we shouldn't release the
     * semaphore since we haven't tried to lock it yet.
     */
    if (early_deadlock)
    {
        RemoveFromWaitQueue(MyProc, hashcode);
        return STATUS_ERROR;
    }

    /* mark that we are waiting for a lock */
    lockAwaited = locallock;

    /*
     * Release the lock table's partition lock.
     *
     * NOTE: this may also cause us to exit critical-section state, possibly
     * allowing a cancel/die interrupt to be accepted.  This is OK because we
     * have recorded the fact that we are waiting for a lock, and so
     * LockErrorCleanup will clean up if cancel/die happens.
     */
    LWLockRelease(partitionLock);

    /*
     * Also, now that we will successfully clean up after an ereport, it's
     * safe to check to see if there's a buffer pin deadlock against the
     * Startup process.  Of course, that's only necessary if we're doing Hot
     * Standby and are not the Startup process ourselves.
     */
    if (RecoveryInProgress() && !InRecovery)
        CheckRecoveryConflictDeadlock();

    /* Reset deadlock_state before enabling the timeout handler */
    deadlock_state = DS_NOT_YET_CHECKED;

    /*
     * Set timer so we can wake up after awhile and check for a deadlock.  If
     * a deadlock is detected, the handler releases the process's semaphore
     * and sets MyProc->waitStatus = STATUS_ERROR, allowing us to know that we
     * must report failure rather than success.
     *
     * By delaying the check until we've waited for a bit, we can avoid
     * running the rather expensive deadlock-check code in most cases.
     */
    enable_timeout_after(DEADLOCK_TIMEOUT, DeadlockTimeout);

    /*
     * If someone wakes us between LWLockRelease and PGSemaphoreLock,
     * PGSemaphoreLock will not block.  The wakeup is "saved" by the semaphore
     * implementation.  While this is normally good, there are cases where a
     * saved wakeup might be leftover from a previous operation (for example,
     * we aborted ProcWaitForSignal just before someone did ProcSendSignal).
     * So, loop to wait again if the waitStatus shows we haven't been granted
     * nor denied the lock yet.
     *
     * We pass interruptOK = true, which eliminates a window in which
     * cancel/die interrupts would be held off undesirably.  This is a promise
     * that we don't mind losing control to a cancel/die interrupt here.  We
     * don't, because we have no shared-state-change work to do after being
     * granted the lock (the grantor did it all).  We do have to worry about
     * canceling the deadlock timeout and updating the locallock table, but if
     * we lose control to an error, LockErrorCleanup will fix that up.
     */
    do
    {
        PGSemaphoreLock(&MyProc->sem, true);

        /*
         * waitStatus could change from STATUS_WAITING to something else
         * asynchronously.  Read it just once per loop to prevent surprising
         * behavior (such as missing log messages).
         */
        myWaitStatus = MyProc->waitStatus;

        /*
         * If we are not deadlocked, but are waiting on an autovacuum-induced
         * task, send a signal to interrupt it.
         */
        if (deadlock_state == DS_BLOCKED_BY_AUTOVACUUM && allow_autovacuum_cancel)
        {
            PGPROC     *autovac = GetBlockingAutoVacuumPgproc();

            /* don't dereference autovac until we know it isn't NULL */
            PGXACT     *autovac_pgxact = (autovac != NULL) ?
                &ProcGlobal->allPgXact[autovac->pgprocno] : NULL;

            LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

            /*
             * Only do it if the worker is not working to protect against Xid
             * wraparound.
             */
            if ((autovac != NULL) &&
                (autovac_pgxact->vacuumFlags & PROC_IS_AUTOVACUUM) &&
                !(autovac_pgxact->vacuumFlags & PROC_VACUUM_FOR_WRAPAROUND))
            {
                int         pid = autovac->pid;
                StringInfoData locktagbuf;
                StringInfoData logbuf;  /* errdetail for server log */

                initStringInfo(&locktagbuf);
                initStringInfo(&logbuf);
                DescribeLockTag(&locktagbuf, &lock->tag);
                appendStringInfo(&logbuf,
                                 _("Process %d waits for %s on %s."),
                                 MyProcPid,
                                 GetLockmodeName(lock->tag.locktag_lockmethodid, lockmode),
                                 locktagbuf.data);

                /* release lock as quickly as possible */
                LWLockRelease(ProcArrayLock);

                ereport(LOG,
                        (errmsg("sending cancel to blocking autovacuum PID %d", pid),
                         errdetail_log("%s", logbuf.data)));

                pfree(logbuf.data);
                pfree(locktagbuf.data);

                /* send the autovacuum worker Back to Old Kent Road */
                if (kill(pid, SIGINT) < 0)
                {
                    /* Just a warning to allow multiple callers */
                    ereport(WARNING,
                            (errmsg("could not send signal to process %d: %m", pid)));
                }
            }
            else
                LWLockRelease(ProcArrayLock);

            /* prevent signal from being resent more than once */
            allow_autovacuum_cancel = false;
        }

        /*
         * If awoken after the deadlock check interrupt has run, and
         * log_lock_waits is on, then report about the wait.
         */
        if (log_lock_waits && deadlock_state != DS_NOT_YET_CHECKED)
        {
            StringInfoData buf;
            const char *modename;
            long        secs;
            int         usecs;
            long        msecs;

            initStringInfo(&buf);
            DescribeLockTag(&buf, &locallock->tag.lock);
            modename = GetLockmodeName(locallock->tag.lock.locktag_lockmethodid,
                                       lockmode);
            TimestampDifference(get_timeout_start_time(DEADLOCK_TIMEOUT),
                                GetCurrentTimestamp(),
                                &secs, &usecs);
            msecs = secs * 1000 + usecs / 1000;
            usecs = usecs % 1000;

            if (deadlock_state == DS_SOFT_DEADLOCK)
                ereport(LOG,
                        (errmsg("process %d avoided deadlock for %s on %s by rearranging queue order after %ld.%03d ms",
                                MyProcPid, modename, buf.data, msecs, usecs)));
            else if (deadlock_state == DS_HARD_DEADLOCK)
            {
                /*
                 * This message is a bit redundant with the error that will be
                 * reported subsequently, but in some cases the error report
                 * might not make it to the log (eg, if it's caught by an
                 * exception handler), and we want to ensure all long-wait
                 * events get logged.
                 */
                ereport(LOG,
                        (errmsg("process %d detected deadlock while waiting for %s on %s after %ld.%03d ms",
                                MyProcPid, modename, buf.data, msecs, usecs)));
            }

            if (myWaitStatus == STATUS_WAITING)
                ereport(LOG,
                        (errmsg("process %d still waiting for %s on %s after %ld.%03d ms",
                                MyProcPid, modename, buf.data, msecs, usecs)));
            else if (myWaitStatus == STATUS_OK)
                ereport(LOG,
                        (errmsg("process %d acquired %s on %s after %ld.%03d ms",
                                MyProcPid, modename, buf.data, msecs, usecs)));
            else
            {
                Assert(myWaitStatus == STATUS_ERROR);

                /*
                 * Currently, the deadlock checker always kicks its own
                 * process, which means that we'll only see STATUS_ERROR when
                 * deadlock_state == DS_HARD_DEADLOCK, and there's no need to
                 * print redundant messages.  But for completeness and
                 * future-proofing, print a message if it looks like someone
                 * else kicked us off the lock.
                 */
                if (deadlock_state != DS_HARD_DEADLOCK)
                    ereport(LOG,
                            (errmsg("process %d failed to acquire %s on %s after %ld.%03d ms",
                                    MyProcPid, modename, buf.data, msecs, usecs)));
            }

            /*
             * At this point we might still need to wait for the lock.  Reset
             * state so we don't print the above messages again.
             */
            deadlock_state = DS_NO_DEADLOCK;

            pfree(buf.data);
        }
    } while (myWaitStatus == STATUS_WAITING);

    /*
     * Disable the timer, if it's still running
     */
    disable_timeout(DEADLOCK_TIMEOUT, false);

    /*
     * Re-acquire the lock table's partition lock.
     * We have to do this to hold
     * off cancel/die interrupts before we can mess with lockAwaited (else we
     * might have a missed or duplicated locallock update).
     */
    LWLockAcquire(partitionLock, LW_EXCLUSIVE);

    /*
     * We no longer want LockErrorCleanup to do anything.
     */
    lockAwaited = NULL;

    /*
     * If we got the lock, be sure to remember it in the locallock table.
     */
    if (MyProc->waitStatus == STATUS_OK)
        GrantAwaitedLock();

    /*
     * We don't have to do anything else, because the awaker did all the
     * necessary update of the lock table and MyProc.
     */
    return MyProc->waitStatus;
}
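/*
 * Standalone sketch, not from the sources above: ProcSleep's do/while copes
 * with "saved" semaphore wakeups -- a stray post left over from an earlier
 * operation makes the P() return immediately, so the shared status must be
 * re-read and the wait repeated until it actually changes.  A model with a
 * POSIX semaphore (names hypothetical; sem_init(&wait_sem, 0, 0) must run
 * once before use).
 */
#include <semaphore.h>

#define WAITING 0
#define GRANTED 1
#define DENIED  2

static sem_t wait_sem;
static volatile int wait_status = WAITING;

static int
sleep_until_decided(void)
{
    int         status;

    do
    {
        sem_wait(&wait_sem);    /* may return early on a stale post */

        /* Read the shared status just once per iteration. */
        status = wait_status;
    } while (status == WAITING);

    return status;              /* GRANTED or DENIED */
}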
/*
 * Grabs one entry in the sessionStateArray for current session.
 * If the current session already has an entry, it just returns the
 * pointer to the previously grabbed entry.
 */
static SessionState *
SessionState_Acquire(int sessionId)
{
    LWLockAcquire(SessionStateLock, LW_EXCLUSIVE);

    SessionState *cur = AllSessionStateEntries->usedList;

    while (cur != NULL && cur->sessionId != sessionId)
    {
        Assert(INVALID_SESSION_ID != cur->sessionId);
        cur = cur->next;
    }

    if (NULL == cur && NULL == AllSessionStateEntries->freeList)
    {
        LWLockRelease(SessionStateLock);
        ereport(FATAL,
                (errcode(ERRCODE_TOO_MANY_CONNECTIONS),
                 errmsg("Too many sessions."),
                 errdetail("Could not acquire resources for additional sessions."),
                 errhint("Disconnect some sessions and try again.")));
    }

    SessionState *acquired = cur;

    /*
     * No other process has acquired an entry for this session yet.
     * Therefore, acquire a new entry, and reset its properties.
     */
    if (NULL == acquired)
    {
        acquired = AllSessionStateEntries->freeList;

        Assert(INVALID_SESSION_ID == acquired->sessionId &&
               acquired->runawayStatus == RunawayStatus_NotRunaway &&
               0 == acquired->pinCount &&
               CLEANUP_COUNTDOWN_BEFORE_RUNAWAY == acquired->cleanupCountdown &&
               0 == acquired->activeProcessCount &&
               0 == acquired->sessionVmem &&
               0 == acquired->spinLock &&
               0 == acquired->sessionVmemRunaway &&
               0 == acquired->commandCountRunaway &&
               !acquired->isModifiedSessionId);

        AllSessionStateEntries->freeList = acquired->next;
        acquired->next = AllSessionStateEntries->usedList;
        AllSessionStateEntries->usedList = acquired;
        AllSessionStateEntries->numSession++;
        Assert(AllSessionStateEntries->numSession <= AllSessionStateEntries->maxSession);

        acquired->sessionId = sessionId;
        acquired->runawayStatus = RunawayStatus_NotRunaway;
        acquired->sessionVmemRunaway = 0;
        acquired->commandCountRunaway = 0;
        acquired->pinCount = 0;
        acquired->sessionVmem = 0;
        acquired->cleanupCountdown = CLEANUP_COUNTDOWN_BEFORE_RUNAWAY;
        acquired->activeProcessCount = 0;
        acquired->idle_start = 0;
        acquired->resGroupSlot = NULL;

#ifdef USE_ASSERT_CHECKING
        acquired->isModifiedSessionId = false;
#endif

        /*
         * Make sure that the lock is reset to released.  Note: this doesn't
         * have a matching SpinLockAcquire.  We are just resetting the lock
         * as part of initialization.
         */
        SpinLockRelease(&acquired->spinLock);
    }

    Assert(NULL != acquired);

    int         pinCount = pg_atomic_add_fetch_u32((pg_atomic_uint32 *) &acquired->pinCount, 1);

    ereport(gp_sessionstate_loglevel,
            (errmsg("SessionState_Acquire: pinCount: %d, activeProcessCount: %d",
                    pinCount, acquired->activeProcessCount),
             errprintstack(true)));

    LWLockRelease(SessionStateLock);

    return acquired;
}
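/*
 * Standalone sketch, not from the sources above: the pinCount is bumped with
 * pg_atomic_add_fetch_u32 even though the LWLock is held, so readers
 * elsewhere can observe it without taking the lock.  The same reference-count
 * idiom with C11 atomics (names hypothetical).
 */
#include <stdatomic.h>
#include <stdint.h>

static atomic_uint pin_count;   /* zero-initialized at file scope */

static uint32_t
pin(void)
{
    /* Returns the post-increment value, like pg_atomic_add_fetch_u32. */
    return atomic_fetch_add(&pin_count, 1) + 1;
}

static uint32_t
unpin(void)
{
    /* Returns the post-decrement value, like pg_atomic_sub_fetch_u32. */
    return atomic_fetch_sub(&pin_count, 1) - 1;
}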
/*
 * We have to copy the code of GetNewTransactionId from varsup.c here,
 * because we change the way ShmemVariableCache->nextXid is advanced.
 */
TransactionId
DtmGetNewTransactionId(bool isSubXact)
{
    TransactionId xid;

    XTM_INFO("%d: GetNewTransactionId\n", getpid());

    /*
     * Workers synchronize transaction state at the beginning of each parallel
     * operation, so we can't account for new XIDs after that point.
     */
    if (IsInParallelMode())
        elog(ERROR, "cannot assign TransactionIds during a parallel operation");

    /*
     * During bootstrap initialization, we return the special bootstrap
     * transaction id.
     */
    if (IsBootstrapProcessingMode())
    {
        Assert(!isSubXact);
        MyPgXact->xid = BootstrapTransactionId;
        return BootstrapTransactionId;
    }

    /* safety check, we should never get this far in a HS slave */
    if (RecoveryInProgress())
        elog(ERROR, "cannot assign TransactionIds during recovery");

    LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
    xid = DtmGetNextXid();

    /*----------
     * Check to see if it's safe to assign another XID.  This protects against
     * catastrophic data loss due to XID wraparound.  The basic rules are:
     *
     * If we're past xidVacLimit, start trying to force autovacuum cycles.
     * If we're past xidWarnLimit, start issuing warnings.
     * If we're past xidStopLimit, refuse to execute transactions, unless
     * we are running in single-user mode (which gives an escape hatch
     * to the DBA who somehow got past the earlier defenses).
     *
     * Note that this coding also appears in GetNewMultiXactId.
     *----------
     */
    if (TransactionIdFollowsOrEquals(xid, ShmemVariableCache->xidVacLimit))
    {
        /*
         * For safety's sake, we release XidGenLock while sending signals,
         * warnings, etc.  This is not so much because we care about
         * preserving concurrency in this situation, as to avoid any
         * possibility of deadlock while doing get_database_name().  First,
         * copy all the shared values we'll need in this path.
         */
        TransactionId xidWarnLimit = ShmemVariableCache->xidWarnLimit;
        TransactionId xidStopLimit = ShmemVariableCache->xidStopLimit;
        TransactionId xidWrapLimit = ShmemVariableCache->xidWrapLimit;
        Oid         oldest_datoid = ShmemVariableCache->oldestXidDB;

        LWLockRelease(XidGenLock);

        /*
         * To avoid swamping the postmaster with signals, we issue the autovac
         * request only once per 64K transaction starts.  This still gives
         * plenty of chances before we get into real trouble.
         */
        if (IsUnderPostmaster && (xid % 65536) == 0)
            SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);

        if (IsUnderPostmaster &&
            TransactionIdFollowsOrEquals(xid, xidStopLimit))
        {
            char       *oldest_datname = get_database_name(oldest_datoid);

            /* complain even if that DB has disappeared */
            if (oldest_datname)
                ereport(ERROR,
                        (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                         errmsg("database is not accepting commands to avoid wraparound data loss in database \"%s\"",
                                oldest_datname),
                         errhint("Stop the postmaster and vacuum that database in single-user mode.\n"
                                 "You might also need to commit or roll back old prepared transactions.")));
            else
                ereport(ERROR,
                        (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                         errmsg("database is not accepting commands to avoid wraparound data loss in database with OID %u",
                                oldest_datoid),
                         errhint("Stop the postmaster and vacuum that database in single-user mode.\n"
                                 "You might also need to commit or roll back old prepared transactions.")));
        }
        else if (TransactionIdFollowsOrEquals(xid, xidWarnLimit))
        {
            char       *oldest_datname = get_database_name(oldest_datoid);

            /* complain even if that DB has disappeared */
            if (oldest_datname)
                ereport(WARNING,
                        (errmsg("database \"%s\" must be vacuumed within %u transactions",
                                oldest_datname,
                                xidWrapLimit - xid),
                         errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n"
                                 "You might also need to commit or roll back old prepared transactions.")));
            else
                ereport(WARNING,
                        (errmsg("database with OID %u must be vacuumed within %u transactions",
                                oldest_datoid,
                                xidWrapLimit - xid),
                         errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n"
                                 "You might also need to commit or roll back old prepared transactions.")));
        }

        /* Re-acquire lock and start over */
        LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
        xid = DtmGetNextXid();
    }

    /*
     * If we are allocating the first XID of a new page of the commit log,
     * zero out that commit-log page before returning.  We must do this while
     * holding XidGenLock, else another xact could acquire and commit a later
     * XID before we zero the page.  Fortunately, a page of the commit log
     * holds 32K or more transactions, so we don't have to do this very often.
     *
     * Extend pg_subtrans and pg_commit_ts too.
     */
    if (TransactionIdFollowsOrEquals(xid, ShmemVariableCache->nextXid))
    {
        ExtendCLOG(xid);
        ExtendCommitTs(xid);
        ExtendSUBTRANS(xid);
    }

    /*
     * Now advance the nextXid counter.  This must not happen until after we
     * have successfully completed ExtendCLOG() --- if that routine fails, we
     * want the next incoming transaction to try it again.  We cannot assign
     * more XIDs until there is CLOG space for them.
     */
    if (xid == ShmemVariableCache->nextXid)
        TransactionIdAdvance(ShmemVariableCache->nextXid);
    else
        Assert(TransactionIdPrecedes(xid, ShmemVariableCache->nextXid));

    /*
     * We must store the new XID into the shared ProcArray before releasing
     * XidGenLock.  This ensures that every active XID older than
     * latestCompletedXid is present in the ProcArray, which is essential for
     * correct OldestXmin tracking; see src/backend/access/transam/README.
     *
     * XXX by storing xid into MyPgXact without acquiring ProcArrayLock, we
     * are relying on fetch/store of an xid to be atomic, else other backends
     * might see a partially-set xid here.  But holding both locks at once
     * would be a nasty concurrency hit.  So for now, assume atomicity.
     *
     * Note that readers of PGXACT xid fields should be careful to fetch the
     * value only once, rather than assume they can read a value multiple
     * times and get the same answer each time.
     *
     * The same comments apply to the subxact xid count and overflow fields.
     *
     * A solution to the atomic-store problem would be to give each PGXACT its
     * own spinlock used only for fetching/storing that PGXACT's xid and
     * related fields.
     *
     * If there's no room to fit a subtransaction XID into PGPROC, set the
     * cache-overflowed flag instead.  This forces readers to look in
     * pg_subtrans to map subtransaction XIDs up to top-level XIDs.  There is
     * a race-condition window, in that the new XID will not appear as running
     * until its parent link has been placed into pg_subtrans.  However, that
     * will happen before anyone could possibly have a reason to inquire about
     * the status of the XID, so it seems OK.  (Snapshots taken during this
     * window *will* include the parent XID, so they will deliver the correct
     * answer later on when someone does have a reason to inquire.)
     */
    {
        /*
         * Use volatile pointer to prevent code rearrangement; other backends
         * could be examining my subxids info concurrently, and we don't want
         * them to see an invalid intermediate state, such as incrementing
         * nxids before filling the array entry.  Note we are assuming that
         * TransactionId and int fetch/store are atomic.
         */
        volatile PGPROC *myproc = MyProc;
        volatile PGXACT *mypgxact = MyPgXact;

        if (!isSubXact)
            mypgxact->xid = xid;
        else
        {
            int         nxids = mypgxact->nxids;

            if (nxids < PGPROC_MAX_CACHED_SUBXIDS)
            {
                myproc->subxids.xids[nxids] = xid;
                mypgxact->nxids = nxids + 1;
            }
            else
                mypgxact->overflowed = true;
        }
    }

    LWLockRelease(XidGenLock);

    return xid;
}
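/*
 * Standalone sketch, not from the sources above: the wraparound limits in
 * DtmGetNewTransactionId (xidVacLimit, xidWarnLimit, xidStopLimit) only work
 * because XID comparison is circular.  TransactionIdPrecedes is defined on
 * signed 32-bit distance, so every normal XID has about 2^31 successors and
 * 2^31 predecessors.  The core of that comparison:
 */
#include <stdint.h>
#include <stdbool.h>

typedef uint32_t xid_t;

/* true iff a is "older than" b on the 2^32 circle (normal XIDs only) */
static bool
xid_precedes(xid_t a, xid_t b)
{
    int32_t     diff = (int32_t) (a - b);

    return diff < 0;
}
/* e.g. xid_precedes(UINT32_MAX, 3) is true: 3 is "ahead" across the wrap */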
/*
 * shmem_startup hook: allocate or attach to shared memory,
 * then load any pre-existing statistics from file.
 */
static void
pgss_shmem_startup(void)
{
    bool        found;
    HASHCTL     info;
    FILE       *file;
    uint32      header;
    int32       num;
    int32       i;
    int         query_size;
    int         buffer_size;
    char       *buffer = NULL;

    if (prev_shmem_startup_hook)
        prev_shmem_startup_hook();

    /* reset in case this is a restart within the postmaster */
    pgss = NULL;
    pgss_hash = NULL;

    /*
     * Create or attach to the shared memory state, including hash table
     */
    LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);

    pgss = ShmemInitStruct("pg_stat_statements", sizeof(pgssSharedState), &found);

    if (!found)
    {
        /* First time through ... */
        pgss->lock = LWLockAssign();
        pgss->query_size = pgstat_track_activity_query_size;
    }

    /* Be sure everyone agrees on the hash table entry size */
    query_size = pgss->query_size;

    memset(&info, 0, sizeof(info));
    info.keysize = sizeof(pgssHashKey);
    info.entrysize = offsetof(pgssEntry, query) + query_size;
    info.hash = pgss_hash_fn;
    info.match = pgss_match_fn;
    pgss_hash = ShmemInitHash("pg_stat_statements hash",
                              pgss_max, pgss_max,
                              &info,
                              HASH_ELEM | HASH_FUNCTION | HASH_COMPARE);

    LWLockRelease(AddinShmemInitLock);

    /*
     * If we're in the postmaster (or a standalone backend...), set up a shmem
     * exit hook to dump the statistics to disk.
     */
    if (!IsUnderPostmaster)
        on_shmem_exit(pgss_shmem_shutdown, (Datum) 0);

    /*
     * Attempt to load old statistics from the dump file, if this is the first
     * time through and we weren't told not to.
     */
    if (found || !pgss_save)
        return;

    /*
     * Note: we don't bother with locks here, because there should be no other
     * processes running when this code is reached.
     */
    file = AllocateFile(PGSS_DUMP_FILE, PG_BINARY_R);
    if (file == NULL)
    {
        if (errno == ENOENT)
            return;             /* ignore not-found error */
        goto error;
    }

    buffer_size = query_size;
    buffer = (char *) palloc(buffer_size);

    if (fread(&header, sizeof(uint32), 1, file) != 1 ||
        header != PGSS_FILE_HEADER ||
        fread(&num, sizeof(int32), 1, file) != 1)
        goto error;

    for (i = 0; i < num; i++)
    {
        pgssEntry   temp;
        pgssEntry  *entry;

        if (fread(&temp, offsetof(pgssEntry, mutex), 1, file) != 1)
            goto error;

        /* Encoding is the only field we can easily sanity-check */
        if (!PG_VALID_BE_ENCODING(temp.key.encoding))
            goto error;

        /* Previous incarnation might have had a larger query_size */
        if (temp.key.query_len >= buffer_size)
        {
            buffer = (char *) repalloc(buffer, temp.key.query_len + 1);
            buffer_size = temp.key.query_len + 1;
        }

        if (fread(buffer, 1, temp.key.query_len, file) != temp.key.query_len)
            goto error;
        buffer[temp.key.query_len] = '\0';

        /* Clip to available length if needed */
        if (temp.key.query_len >= query_size)
            temp.key.query_len = pg_encoding_mbcliplen(temp.key.encoding,
                                                       buffer,
                                                       temp.key.query_len,
                                                       query_size - 1);
        temp.key.query_ptr = buffer;

        /* make the hashtable entry (discards old entries if too many) */
        entry = entry_alloc(&temp.key);

        /* copy in the actual stats */
        entry->counters = temp.counters;
    }

    pfree(buffer);
    FreeFile(file);
    return;

error:
    ereport(LOG,
            (errcode_for_file_access(),
             errmsg("could not read pg_stat_statements file \"%s\": %m",
                    PGSS_DUMP_FILE)));
    if (buffer)
        pfree(buffer);
    if (file)
        FreeFile(file);
    /* If possible, throw away the bogus file; ignore any error */
    unlink(PGSS_DUMP_FILE);
}
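/*
 * Standalone sketch, not from the sources above: pgss_shmem_startup's loader
 * is a textbook single-exit error path -- every short fread jumps to one
 * cleanup label that frees the buffer, closes the file, and discards the
 * presumably-corrupt dump.  The bare skeleton of that idiom over a
 * hypothetical file format.
 */
#include <stdio.h>
#include <stdlib.h>

#define MAGIC 0x20220101u

static int
load_dump(const char *path)
{
    FILE       *file = fopen(path, "rb");
    char       *buffer = NULL;
    unsigned    header;
    int         num, i;

    if (file == NULL)
        return -1;              /* treat a missing file as "no data" */

    if (fread(&header, sizeof(header), 1, file) != 1 ||
        header != MAGIC ||
        fread(&num, sizeof(num), 1, file) != 1)
        goto error;

    for (i = 0; i < num; i++)
    {
        /* ... read one record, goto error on any short read ... */
    }

    fclose(file);
    return 0;

error:
    free(buffer);               /* free(NULL) is a no-op */
    fclose(file);
    remove(path);               /* throw away the bogus file */
    return -1;
}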
/*
 * Acquire the FTS control lock in exclusive mode.
 */
void
ftsLock(void)
{
    LWLockAcquire(ftsControlLock, LW_EXCLUSIVE);
}
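/*
 * Assumed counterpart, not shown in this excerpt: a wrapper this thin
 * normally has a matching release.  The companion presumably looks like the
 * following; treat the name as hypothetical if it differs in the actual tree.
 */
void
ftsUnlock(void)
{
    LWLockRelease(ftsControlLock);
}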
/*
 * Store some statistics for a statement.
 */
static void
pgss_store(const char *query, double total_time, uint64 rows,
           const BufferUsage *bufusage)
{
    pgssHashKey key;
    double      usage;
    pgssEntry  *entry;

    Assert(query != NULL);

    /* Safety check... */
    if (!pgss || !pgss_hash)
        return;

    /* Set up key for hashtable search */
    key.userid = GetUserId();
    key.dbid = MyDatabaseId;
    key.encoding = GetDatabaseEncoding();
    key.query_len = strlen(query);
    if (key.query_len >= pgss->query_size)
        key.query_len = pg_encoding_mbcliplen(key.encoding,
                                              query,
                                              key.query_len,
                                              pgss->query_size - 1);
    key.query_ptr = query;

    usage = USAGE_EXEC(total_time);

    /* Lookup the hash table entry with shared lock. */
    LWLockAcquire(pgss->lock, LW_SHARED);

    entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_FIND, NULL);
    if (!entry)
    {
        /* Must acquire exclusive lock to add a new entry. */
        LWLockRelease(pgss->lock);
        LWLockAcquire(pgss->lock, LW_EXCLUSIVE);
        entry = entry_alloc(&key);
    }

    /* Grab the spinlock while updating the counters. */
    {
        volatile pgssEntry *e = (volatile pgssEntry *) entry;

        SpinLockAcquire(&e->mutex);
        e->counters.calls += 1;
        e->counters.total_time += total_time;
        e->counters.rows += rows;
        e->counters.shared_blks_hit += bufusage->shared_blks_hit;
        e->counters.shared_blks_read += bufusage->shared_blks_read;
        e->counters.shared_blks_written += bufusage->shared_blks_written;
        e->counters.local_blks_hit += bufusage->local_blks_hit;
        e->counters.local_blks_read += bufusage->local_blks_read;
        e->counters.local_blks_written += bufusage->local_blks_written;
        e->counters.temp_blks_read += bufusage->temp_blks_read;
        e->counters.temp_blks_written += bufusage->temp_blks_written;
        e->counters.usage += usage;
        SpinLockRelease(&e->mutex);
    }

    LWLockRelease(pgss->lock);
}
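/*
 * Standalone sketch, not from the sources above: pgss_store probes the hash
 * table under a shared lock and only escalates to exclusive when it must
 * insert; the insert path must tolerate another backend having inserted the
 * same key during the unlocked window.  The same shape with a pthread rwlock
 * and a deliberately trivial single-slot "table" (all names hypothetical).
 */
#include <pthread.h>
#include <stddef.h>

static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;
static int stored_key = -1;     /* stand-in for the hash table */

static void *
find(int key)
{
    return (stored_key == key) ? &stored_key : NULL;
}

static void *
insert(int key)
{
    if (stored_key != key)      /* tolerate a concurrent insert */
        stored_key = key;
    return &stored_key;
}

static void *
lookup_or_insert(int key)
{
    void       *entry;

    pthread_rwlock_rdlock(&table_lock);
    entry = find(key);
    if (entry == NULL)
    {
        /* Must drop the shared lock before taking it exclusively. */
        pthread_rwlock_unlock(&table_lock);
        pthread_rwlock_wrlock(&table_lock);
        entry = insert(key);
    }
    pthread_rwlock_unlock(&table_lock);
    return entry;
}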
/*
 * Permanently drop the currently acquired replication slot which will be
 * released by the point this function returns.
 */
static void
ReplicationSlotDropAcquired(void)
{
    char        path[MAXPGPATH];
    char        tmppath[MAXPGPATH];
    ReplicationSlot *slot = MyReplicationSlot;

    Assert(MyReplicationSlot != NULL);

    /* slot isn't acquired anymore */
    MyReplicationSlot = NULL;

    /*
     * If some other backend ran this code concurrently with us, we might try
     * to delete a slot with a certain name while someone else was trying to
     * create a slot with the same name.
     */
    LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE);

    /* Generate pathnames. */
    sprintf(path, "pg_replslot/%s", NameStr(slot->data.name));
    sprintf(tmppath, "pg_replslot/%s.tmp", NameStr(slot->data.name));

    /*
     * Rename the slot directory on disk, so that we'll no longer recognize
     * this as a valid slot.  Note that if this fails, we've got to mark the
     * slot inactive before bailing out.  If we're dropping an ephemeral slot,
     * we'd better never fail hard as the caller won't expect the slot to
     * survive and this might get called during error handling.
     */
    if (rename(path, tmppath) == 0)
    {
        /*
         * We need to fsync() the directory we just renamed and its parent to
         * make sure that our changes are on disk in a crash-safe fashion.  If
         * fsync() fails, we can't be sure whether the changes are on disk or
         * not.  For now, we handle that by panicking;
         * StartupReplicationSlots() will try to straighten it out after
         * restart.
         */
        START_CRIT_SECTION();
        fsync_fname(tmppath, true);
        fsync_fname("pg_replslot", true);
        END_CRIT_SECTION();
    }
    else
    {
        volatile ReplicationSlot *vslot = slot;
        bool        fail_softly = slot->data.persistency == RS_EPHEMERAL;

        SpinLockAcquire(&slot->mutex);
        vslot->active_pid = 0;
        SpinLockRelease(&slot->mutex);

        ereport(fail_softly ? WARNING : ERROR,
                (errcode_for_file_access(),
                 errmsg("could not rename file \"%s\" to \"%s\": %m",
                        path, tmppath)));
    }

    /*
     * The slot is definitely gone.  Lock out concurrent scans of the array
     * long enough to kill it.  It's OK to clear the active flag here without
     * grabbing the mutex because nobody else can be scanning the array here,
     * and nobody can be attached to this slot and thus access it without
     * scanning the array.
     */
    LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE);
    slot->active_pid = 0;
    slot->in_use = false;
    LWLockRelease(ReplicationSlotControlLock);

    /*
     * Slot is dead and doesn't prevent resource removal anymore, recompute
     * limits.
     */
    ReplicationSlotsComputeRequiredXmin(false);
    ReplicationSlotsComputeRequiredLSN();

    /*
     * If removing the directory fails, the worst thing that will happen is
     * that the user won't be able to create a new slot with the same name
     * until the next server restart.  We warn about it, but that's all.
     */
    if (!rmtree(tmppath, true))
        ereport(WARNING,
                (errcode_for_file_access(),
                 errmsg("could not remove directory \"%s\"", tmppath)));

    /*
     * We release this at the very end, so that nobody starts trying to create
     * a slot while we're still cleaning up the detritus of the old one.
     */
    LWLockRelease(ReplicationSlotAllocationLock);
}
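/*
 * Standalone sketch, not from the sources above: the drop path leans on a
 * POSIX durability rule -- rename() is atomic, but surviving a crash also
 * requires fsync()ing the directory that holds the entry, which is what the
 * fsync_fname() calls above do.  The bare pattern:
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int
durable_rename(const char *oldpath, const char *newpath, const char *dir)
{
    int         fd;

    if (rename(oldpath, newpath) != 0)
        return -1;

    /* fsync the containing directory so the rename itself is durable */
    fd = open(dir, O_RDONLY);
    if (fd < 0)
        return -1;
    if (fsync(fd) != 0)
    {
        close(fd);
        return -1;
    }
    return close(fd);
}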
/*
 * Retrieve statement statistics.
 */
Datum
pg_stat_statements(PG_FUNCTION_ARGS)
{
    ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
    TupleDesc   tupdesc;
    Tuplestorestate *tupstore;
    MemoryContext per_query_ctx;
    MemoryContext oldcontext;
    Oid         userid = GetUserId();
    bool        is_superuser = superuser();
    HASH_SEQ_STATUS hash_seq;
    pgssEntry  *entry;

    if (!pgss || !pgss_hash)
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                 errmsg("pg_stat_statements must be loaded via shared_preload_libraries")));

    /* check to see if caller supports us returning a tuplestore */
    if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("set-valued function called in context that cannot accept a set")));
    if (!(rsinfo->allowedModes & SFRM_Materialize))
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("materialize mode required, but it is not " \
                        "allowed in this context")));

    /* Build a tuple descriptor for our result type */
    if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
        elog(ERROR, "return type must be a row type");

    per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
    oldcontext = MemoryContextSwitchTo(per_query_ctx);

    tupstore = tuplestore_begin_heap(true, false, work_mem);
    rsinfo->returnMode = SFRM_Materialize;
    rsinfo->setResult = tupstore;
    rsinfo->setDesc = tupdesc;

    MemoryContextSwitchTo(oldcontext);

    LWLockAcquire(pgss->lock, LW_SHARED);

    hash_seq_init(&hash_seq, pgss_hash);
    while ((entry = hash_seq_search(&hash_seq)) != NULL)
    {
        Datum       values[PG_STAT_STATEMENTS_COLS];
        bool        nulls[PG_STAT_STATEMENTS_COLS];
        int         i = 0;
        Counters    tmp;

        memset(values, 0, sizeof(values));
        memset(nulls, 0, sizeof(nulls));

        values[i++] = ObjectIdGetDatum(entry->key.userid);
        values[i++] = ObjectIdGetDatum(entry->key.dbid);

        if (is_superuser || entry->key.userid == userid)
        {
            char       *qstr;

            qstr = (char *)
                pg_do_encoding_conversion((unsigned char *) entry->query,
                                          entry->key.query_len,
                                          entry->key.encoding,
                                          GetDatabaseEncoding());
            values[i++] = CStringGetTextDatum(qstr);
            if (qstr != entry->query)
                pfree(qstr);
        }
        else
            values[i++] = CStringGetTextDatum("<insufficient privilege>");

        /* copy counters to a local variable to keep locking time short */
        {
            volatile pgssEntry *e = (volatile pgssEntry *) entry;

            SpinLockAcquire(&e->mutex);
            tmp = e->counters;
            SpinLockRelease(&e->mutex);
        }

        values[i++] = Int64GetDatumFast(tmp.calls);
        values[i++] = Float8GetDatumFast(tmp.total_time);
        values[i++] = Int64GetDatumFast(tmp.rows);
        values[i++] = Int64GetDatumFast(tmp.shared_blks_hit);
        values[i++] = Int64GetDatumFast(tmp.shared_blks_read);
        values[i++] = Int64GetDatumFast(tmp.shared_blks_written);
        values[i++] = Int64GetDatumFast(tmp.local_blks_hit);
        values[i++] = Int64GetDatumFast(tmp.local_blks_read);
        values[i++] = Int64GetDatumFast(tmp.local_blks_written);
        values[i++] = Int64GetDatumFast(tmp.temp_blks_read);
        values[i++] = Int64GetDatumFast(tmp.temp_blks_written);

        Assert(i == PG_STAT_STATEMENTS_COLS);

        tuplestore_putvalues(tupstore, tupdesc, values, nulls);
    }

    LWLockRelease(pgss->lock);

    /* clean up and return the tuplestore */
    tuplestore_donestoring(tupstore);

    return (Datum) 0;
}
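/*
 * Standalone sketch, not from the sources above: the reader loop copies each
 * entry's counters into a local struct under the spinlock and does all the
 * slow formatting afterwards, keeping the lock hold time to a struct copy.
 * The pattern in isolation (names hypothetical).
 */
#include <pthread.h>
#include <stdio.h>

typedef struct SharedCounters
{
    long long   calls;
    double      total_time;
} SharedCounters;

static pthread_mutex_t counters_mutex = PTHREAD_MUTEX_INITIALIZER;
static SharedCounters shared_counters;

static void
report(void)
{
    SharedCounters tmp;

    /* Hold the lock only long enough to copy the struct. */
    pthread_mutex_lock(&counters_mutex);
    tmp = shared_counters;
    pthread_mutex_unlock(&counters_mutex);

    /* All the formatting happens on the private copy. */
    printf("calls=%lld total_time=%f\n", tmp.calls, tmp.total_time);
}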
/*
 * Returns activity of walsenders, including pids and xlog locations sent to
 * standby servers.
 */
Datum
pg_stat_get_wal_senders(PG_FUNCTION_ARGS)
{
#define PG_STAT_GET_WAL_SENDERS_COLS 8
    ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
    TupleDesc   tupdesc;
    Tuplestorestate *tupstore;
    MemoryContext per_query_ctx;
    MemoryContext oldcontext;
    int        *sync_priority;
    int         priority = 0;
    int         sync_standby = -1;
    int         i;

    /* check to see if caller supports us returning a tuplestore */
    if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("set-valued function called in context that cannot accept a set")));
    if (!(rsinfo->allowedModes & SFRM_Materialize))
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("materialize mode required, but it is not " \
                        "allowed in this context")));

    /* Build a tuple descriptor for our result type */
    if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
        elog(ERROR, "return type must be a row type");

    per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
    oldcontext = MemoryContextSwitchTo(per_query_ctx);

    tupstore = tuplestore_begin_heap(true, false, work_mem);
    rsinfo->returnMode = SFRM_Materialize;
    rsinfo->setResult = tupstore;
    rsinfo->setDesc = tupdesc;

    MemoryContextSwitchTo(oldcontext);

    /*
     * Get the priorities of sync standbys all in one go, to minimise lock
     * acquisitions and to allow us to evaluate who is the current sync
     * standby.  This code must match the code in SyncRepReleaseWaiters().
     */
    sync_priority = palloc(sizeof(int) * max_wal_senders);
    LWLockAcquire(SyncRepLock, LW_SHARED);
    for (i = 0; i < max_wal_senders; i++)
    {
        /* use volatile pointer to prevent code rearrangement */
        volatile WalSnd *walsnd = &WalSndCtl->walsnds[i];

        if (walsnd->pid != 0)
        {
            sync_priority[i] = walsnd->sync_standby_priority;

            if (walsnd->state == WALSNDSTATE_STREAMING &&
                walsnd->sync_standby_priority > 0 &&
                (priority == 0 ||
                 priority > walsnd->sync_standby_priority))
            {
                priority = walsnd->sync_standby_priority;
                sync_standby = i;
            }
        }
    }
    LWLockRelease(SyncRepLock);

    for (i = 0; i < max_wal_senders; i++)
    {
        /* use volatile pointer to prevent code rearrangement */
        volatile WalSnd *walsnd = &WalSndCtl->walsnds[i];
        char        location[MAXFNAMELEN];
        XLogRecPtr  sentPtr;
        XLogRecPtr  write;
        XLogRecPtr  flush;
        XLogRecPtr  apply;
        WalSndState state;
        Datum       values[PG_STAT_GET_WAL_SENDERS_COLS];
        bool        nulls[PG_STAT_GET_WAL_SENDERS_COLS];

        if (walsnd->pid == 0)
            continue;

        SpinLockAcquire(&walsnd->mutex);
        sentPtr = walsnd->sentPtr;
        state = walsnd->state;
        write = walsnd->write;
        flush = walsnd->flush;
        apply = walsnd->apply;
        SpinLockRelease(&walsnd->mutex);

        memset(nulls, 0, sizeof(nulls));
        values[0] = Int32GetDatum(walsnd->pid);

        if (!superuser())
        {
            /*
             * Only superusers can see details. Other users only get the pid
             * value to know it's a walsender, but no details.
             */
            MemSet(&nulls[1], true, PG_STAT_GET_WAL_SENDERS_COLS - 1);
        }
        else
        {
            values[1] = CStringGetTextDatum(WalSndGetStateString(state));

            snprintf(location, sizeof(location), "%X/%X",
                     sentPtr.xlogid, sentPtr.xrecoff);
            values[2] = CStringGetTextDatum(location);

            if (write.xlogid == 0 && write.xrecoff == 0)
                nulls[3] = true;
            snprintf(location, sizeof(location), "%X/%X",
                     write.xlogid, write.xrecoff);
            values[3] = CStringGetTextDatum(location);

            if (flush.xlogid == 0 && flush.xrecoff == 0)
                nulls[4] = true;
            snprintf(location, sizeof(location), "%X/%X",
                     flush.xlogid, flush.xrecoff);
            values[4] = CStringGetTextDatum(location);

            if (apply.xlogid == 0 && apply.xrecoff == 0)
                nulls[5] = true;
            snprintf(location, sizeof(location), "%X/%X",
                     apply.xlogid, apply.xrecoff);
            values[5] = CStringGetTextDatum(location);

            values[6] = Int32GetDatum(sync_priority[i]);

            /*
             * More easily understood version of standby state.  This is
             * purely informational, not different from priority.
             */
            if (sync_priority[i] == 0)
                values[7] = CStringGetTextDatum("async");
            else if (i == sync_standby)
                values[7] = CStringGetTextDatum("sync");
            else
                values[7] = CStringGetTextDatum("potential");
        }

        tuplestore_putvalues(tupstore, tupdesc, values, nulls);
    }
    pfree(sync_priority);

    /* clean up and return the tuplestore */
    tuplestore_donestoring(tupstore);

    return (Datum) 0;
}
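/*
 * Standalone sketch, not from the sources above: this code predates the
 * 64-bit XLogRecPtr, so a location is an (xlogid, xrecoff) pair printed as
 * "%X/%X".  On a 64-bit LSN the same text form comes from splitting into
 * high and low 32-bit words.
 */
#include <stdint.h>
#include <stdio.h>

static void
format_lsn(uint64_t lsn, char *buf, size_t buflen)
{
    snprintf(buf, buflen, "%X/%X",
             (uint32_t) (lsn >> 32),    /* high word, like xlogid */
             (uint32_t) lsn);           /* low word, like xrecoff */
}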
/*
 * CheckPointRelationMap
 *
 * This is called during a checkpoint.  It must ensure that any relation map
 * updates that were WAL-logged before the start of the checkpoint are
 * securely flushed to disk and will not need to be replayed later.  This
 * seems unlikely to be a performance-critical issue, so we use a simple
 * method: we just take and release the RelationMappingLock.  This ensures
 * that any already-logged map update is complete, because write_relmap_file
 * will fsync the map file before the lock is released.
 */
void
CheckPointRelationMap(void)
{
    LWLockAcquire(RelationMappingLock, LW_SHARED);
    LWLockRelease(RelationMappingLock);
}
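/*
 * Standalone sketch, not from the sources above: taking a lock only to
 * release it again is a barrier -- a shared acquire cannot succeed until any
 * in-progress exclusive holder (here, a map writer mid-fsync) has finished.
 * Modeled with a pthread rwlock.
 */
#include <pthread.h>

static pthread_rwlock_t mapping_lock = PTHREAD_RWLOCK_INITIALIZER;

static void
wait_for_writers(void)
{
    /* Blocks until no writer holds the lock; observes nothing else. */
    pthread_rwlock_rdlock(&mapping_lock);
    pthread_rwlock_unlock(&mapping_lock);
}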