/*
 * Find a page in a shared buffer, reading it in if necessary.
 * The page number must correspond to an already-initialized page.
 *
 * If write_ok is true then it is OK to return a page that is in
 * WRITE_IN_PROGRESS state; it is the caller's responsibility to be sure
 * that modification of the page is safe.  If write_ok is false then we
 * will not return the page until it is not undergoing active I/O.
 *
 * The passed-in xid is used only for error reporting, and may be
 * InvalidTransactionId if no specific xid is associated with the action.
 *
 * Return value is the shared-buffer slot number now holding the page.
 * The buffer's LRU access info is updated.
 *
 * Control lock must be held at entry, and will be held at exit.
 */
int
SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
                  TransactionId xid)
{
    SlruShared  shared = ctl->shared;

    /* Outer loop handles restart if we must wait for someone else's I/O */
    for (;;)
    {
        int         slotno;
        bool        ok;

        /* See if page already is in memory; if not, pick victim slot */
        slotno = SlruSelectLRUPage(ctl, pageno);

        /* Did we find the page in memory? */
        if (shared->page_number[slotno] == pageno &&
            shared->page_status[slotno] != SLRU_PAGE_EMPTY)
        {
            /*
             * If page is still being read in, we must wait for I/O.  Likewise
             * if the page is being written and the caller said that's not OK.
             */
            if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
                (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
                 !write_ok))
            {
                SimpleLruWaitIO(ctl, slotno);
                /* Now we must recheck state from the top */
                continue;
            }

            /* Otherwise, it's ready to use */
            SlruRecentlyUsed(shared, slotno);
            return slotno;
        }

        /* We found no match; assert we selected a freeable slot */
        Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
               (shared->page_status[slotno] == SLRU_PAGE_VALID &&
                !shared->page_dirty[slotno]));

        /* Mark the slot read-busy */
        shared->page_number[slotno] = pageno;
        shared->page_status[slotno] = SLRU_PAGE_READ_IN_PROGRESS;
        shared->page_dirty[slotno] = false;

        /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
        LWLockAcquire(shared->buffer_locks[slotno], LW_EXCLUSIVE);

        /* Release control lock while doing I/O */
        LWLockRelease(shared->ControlLock);

        /* Do the read */
        ok = SlruPhysicalReadPage(ctl, pageno, slotno);

        /* Set the LSNs for this newly read-in page to zero */
        SimpleLruZeroLSNs(ctl, slotno);

        /* Re-acquire control lock and update page state */
        LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);

        Assert(shared->page_number[slotno] == pageno &&
               shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS &&
               !shared->page_dirty[slotno]);

        shared->page_status[slotno] = ok ? SLRU_PAGE_VALID : SLRU_PAGE_EMPTY;

        LWLockRelease(shared->buffer_locks[slotno]);

        /* Now it's okay to ereport if we failed */
        if (!ok)
            SlruReportIOError(ctl, pageno, xid);

        SlruRecentlyUsed(shared, slotno);
        return slotno;
    }
}
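
/*
 * Hedged usage sketch (not part of the original file): the calling pattern
 * the contract above implies, modeled on how clog-style SLRU callers use
 * this function.  "MyCtl", read_slru_byte, and byteno are hypothetical
 * stand-ins; real callers pass e.g. ClogCtl and derive pageno/byteno from
 * an xid.
 */
static char
read_slru_byte(SlruCtl MyCtl, int pageno, int byteno, TransactionId xid)
{
    int         slotno;
    char        result;

    /* The control lock must be held across the read and the page access */
    LWLockAcquire(MyCtl->shared->ControlLock, LW_EXCLUSIVE);
    slotno = SimpleLruReadPage(MyCtl, pageno, true, xid);
    result = MyCtl->shared->page_buffer[slotno][byteno];
    LWLockRelease(MyCtl->shared->ControlLock);

    return result;
}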
/*
 * SIInsertDataEntries
 *      Add new invalidation message(s) to the buffer.
 */
void
SIInsertDataEntries(const SharedInvalidationMessage *data, int n)
{
    SISeg      *segP = shmInvalBuffer;

    /*
     * N can be arbitrarily large.  We divide the work into groups of no more
     * than WRITE_QUANTUM messages, to be sure that we don't hold the lock
     * for an unreasonably long time.  (This is not so much because we care
     * about letting in other writers, as that some just-caught-up backend
     * might be trying to do SICleanupQueue to pass on its signal, and we
     * don't want it to have to wait a long time.)  Also, we need to consider
     * calling SICleanupQueue every so often.
     */
    while (n > 0)
    {
        int         nthistime = Min(n, WRITE_QUANTUM);
        int         numMsgs;
        int         max;

        n -= nthistime;

        LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE);

        /*
         * If the buffer is full, we *must* acquire some space.  Clean the
         * queue and reset anyone who is preventing space from being freed.
         * Otherwise, clean the queue only when it's exceeded the next
         * fullness threshold.  We have to loop and recheck the buffer state
         * after any call of SICleanupQueue.
         */
        for (;;)
        {
            numMsgs = segP->maxMsgNum - segP->minMsgNum;
            if (numMsgs + nthistime > MAXNUMMESSAGES ||
                numMsgs >= segP->nextThreshold)
                SICleanupQueue(true, nthistime);
            else
                break;
        }

        /*
         * Insert new message(s) into proper slot of circular buffer
         */
        max = segP->maxMsgNum;
        while (nthistime-- > 0)
        {
            segP->buffer[max % MAXNUMMESSAGES] = *data++;
            max++;
        }

        /* Update current value of maxMsgNum using spinlock */
        {
            /* use volatile pointer to prevent code rearrangement */
            volatile SISeg *vsegP = segP;

            SpinLockAcquire(&vsegP->msgnumLock);
            vsegP->maxMsgNum = max;
            SpinLockRelease(&vsegP->msgnumLock);
        }

        LWLockRelease(SInvalWriteLock);
    }
}
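
/*
 * Reader-side sketch (hedged; the real counterpart is SIGetDataEntries,
 * which additionally handles resetState).  It shows why maxMsgNum is
 * published under msgnumLock above: a reader takes the spinlock only long
 * enough to fetch a consistent maxMsgNum, then copies messages below that
 * bound, relying on the memory barrier the spinlock pair implies.
 * si_read_upto is a hypothetical name for illustration.
 */
static int
si_read_upto(SISeg *segP, int nextMsgNum, SharedInvalidationMessage *dst)
{
    int         max;
    int         n = 0;

    {
        /* use volatile pointer to prevent code rearrangement */
        volatile SISeg *vsegP = segP;

        SpinLockAcquire(&vsegP->msgnumLock);
        max = vsegP->maxMsgNum;
        SpinLockRelease(&vsegP->msgnumLock);
    }

    /* everything below max has been fully written by some sender */
    while (nextMsgNum < max)
    {
        dst[n++] = segP->buffer[nextMsgNum % MAXNUMMESSAGES];
        nextMsgNum++;
    }
    return n;
}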
/*
 * SICleanupQueue
 *      Remove messages that have been consumed by all active backends
 *
 * callerHasWriteLock is TRUE if caller is holding SInvalWriteLock.
 * minFree is the minimum number of message slots to make free.
 *
 * Possible side effects of this routine include marking one or more
 * backends as "reset" in the array, and sending PROCSIG_CATCHUP_INTERRUPT
 * to some backend that seems to be getting too far behind.  We signal at
 * most one backend at a time, for reasons explained at the top of the file.
 *
 * Caution: because we transiently release write lock when we have to signal
 * some other backend, it is NOT guaranteed that there are still minFree
 * free message slots at exit.  Caller must recheck and perhaps retry.
 */
void
SICleanupQueue(bool callerHasWriteLock, int minFree)
{
    SISeg      *segP = shmInvalBuffer;
    int         min,
                minsig,
                lowbound,
                numMsgs,
                i;
    ProcState  *needSig = NULL;

    /* Lock out all writers and readers */
    if (!callerHasWriteLock)
        LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE);
    LWLockAcquire(SInvalReadLock, LW_EXCLUSIVE);

    /*
     * Recompute minMsgNum = minimum of all backends' nextMsgNum, identify
     * the furthest-back backend that needs signaling (if any), and reset
     * any backends that are too far back.  Note that because we ignore
     * sendOnly backends here it is possible for them to keep sending
     * messages without a problem even when they are the only active
     * backend.
     */
    min = segP->maxMsgNum;
    minsig = min - SIG_THRESHOLD;
    lowbound = min - MAXNUMMESSAGES + minFree;

    for (i = 0; i < segP->lastBackend; i++)
    {
        ProcState  *stateP = &segP->procState[i];
        int         n = stateP->nextMsgNum;

        /* Ignore if inactive or already in reset state */
        if (stateP->procPid == 0 || stateP->resetState || stateP->sendOnly)
            continue;

        /*
         * If we must free some space and this backend is preventing it,
         * force him into reset state and then ignore until he catches up.
         */
        if (n < lowbound)
        {
            stateP->resetState = true;
            /* no point in signaling him ... */
            continue;
        }

        /* Track the global minimum nextMsgNum */
        if (n < min)
            min = n;

        /* Also see who's furthest back of the unsignaled backends */
        if (n < minsig && !stateP->signaled)
        {
            minsig = n;
            needSig = stateP;
        }
    }
    segP->minMsgNum = min;

    /*
     * When minMsgNum gets really large, decrement all message counters so
     * as to forestall overflow of the counters.  This happens seldom enough
     * that folding it into the previous loop would be a loser.
     */
    if (min >= MSGNUMWRAPAROUND)
    {
        segP->minMsgNum -= MSGNUMWRAPAROUND;
        segP->maxMsgNum -= MSGNUMWRAPAROUND;
        for (i = 0; i < segP->lastBackend; i++)
        {
            /* we don't bother skipping inactive entries here */
            segP->procState[i].nextMsgNum -= MSGNUMWRAPAROUND;
        }
    }

    /*
     * Determine how many messages are still in the queue, and set the
     * threshold at which we should repeat SICleanupQueue().
     */
    numMsgs = segP->maxMsgNum - segP->minMsgNum;
    if (numMsgs < CLEANUP_MIN)
        segP->nextThreshold = CLEANUP_MIN;
    else
        segP->nextThreshold = (numMsgs / CLEANUP_QUANTUM + 1) * CLEANUP_QUANTUM;

    /*
     * Lastly, signal anyone who needs a catchup interrupt.  Since
     * SendProcSignal() might not be fast, we don't want to hold locks while
     * executing it.
     */
    if (needSig)
    {
        pid_t       his_pid = needSig->procPid;
        BackendId   his_backendId = (needSig - &segP->procState[0]) + 1;

        needSig->signaled = true;
        LWLockRelease(SInvalReadLock);
        LWLockRelease(SInvalWriteLock);
        elog(DEBUG4, "sending sinval catchup signal to PID %d",
             (int) his_pid);
        SendProcSignal(his_pid, PROCSIG_CATCHUP_INTERRUPT, his_backendId);
        if (callerHasWriteLock)
            LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE);
    }
    else
    {
        LWLockRelease(SInvalReadLock);
        if (!callerHasWriteLock)
            LWLockRelease(SInvalWriteLock);
    }
}
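
/*
 * Worked example of the threshold arithmetic above (hedged: the stock
 * sinvaladt.c values are MAXNUMMESSAGES = 4096, CLEANUP_MIN =
 * MAXNUMMESSAGES/2 = 2048 and CLEANUP_QUANTUM = MAXNUMMESSAGES/16 = 256;
 * check your tree).  If numMsgs = 2500 remain after cleanup, nextThreshold
 * becomes (2500 / 256 + 1) * 256 = (9 + 1) * 256 = 2560, i.e. the next
 * multiple of CLEANUP_QUANTUM strictly above numMsgs, so writers trigger
 * at most one extra cleanup per quantum of queue growth.
 */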
/*
 * CommitTS resource manager's routines
 */
void
commit_ts_redo(XLogReaderState *record)
{
    uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;

    /* Backup blocks are not used in commit_ts records */
    Assert(!XLogRecHasAnyBlockRefs(record));

    if (info == COMMIT_TS_ZEROPAGE)
    {
        int         pageno;
        int         slotno;

        memcpy(&pageno, XLogRecGetData(record), sizeof(int));

        LWLockAcquire(CommitTsControlLock, LW_EXCLUSIVE);

        slotno = ZeroCommitTsPage(pageno, false);
        SimpleLruWritePage(CommitTsCtl, slotno);
        Assert(!CommitTsCtl->shared->page_dirty[slotno]);

        LWLockRelease(CommitTsControlLock);
    }
    else if (info == COMMIT_TS_TRUNCATE)
    {
        int         pageno;

        memcpy(&pageno, XLogRecGetData(record), sizeof(int));

        /*
         * During XLOG replay, latest_page_number isn't set up yet; insert a
         * suitable value to bypass the sanity test in SimpleLruTruncate.
         */
        CommitTsCtl->shared->latest_page_number = pageno;

        SimpleLruTruncate(CommitTsCtl, pageno);
    }
    else if (info == COMMIT_TS_SETTS)
    {
        xl_commit_ts_set *setts = (xl_commit_ts_set *) XLogRecGetData(record);
        int         nsubxids;
        TransactionId *subxids;

        nsubxids = ((XLogRecGetDataLen(record) - SizeOfCommitTsSet) /
                    sizeof(TransactionId));
        if (nsubxids > 0)
        {
            subxids = palloc(sizeof(TransactionId) * nsubxids);
            memcpy(subxids,
                   XLogRecGetData(record) + SizeOfCommitTsSet,
                   sizeof(TransactionId) * nsubxids);
        }
        else
            subxids = NULL;

        TransactionTreeSetCommitTsData(setts->mainxid, nsubxids, subxids,
                                       setts->timestamp, setts->nodeid, true);
        if (subxids)
            pfree(subxids);
    }
    else
        elog(PANIC, "commit_ts_redo: unknown op code %u", info);
}
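
/*
 * Record-layout note for COMMIT_TS_SETTS (hedged, inferred from the
 * arithmetic above): the record body is an xl_commit_ts_set header followed
 * directly by zero or more subtransaction ids.  With, say, 3 subxids the
 * data length is SizeOfCommitTsSet + 3 * sizeof(TransactionId), so the
 * nsubxids computation recovers 3 and the memcpy pulls the array from just
 * past the header.
 */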
/*
 * task_tracker_assign_task creates a new task in the shared hash or updates
 * an already existing task.  The function also creates a schema for the job
 * if it doesn't already exist.
 */
Datum
task_tracker_assign_task(PG_FUNCTION_ARGS)
{
    uint64      jobId = PG_GETARG_INT64(0);
    uint32      taskId = PG_GETARG_UINT32(1);
    text       *taskCallStringText = PG_GETARG_TEXT_P(2);

    StringInfo  jobSchemaName = JobSchemaName(jobId);
    bool        schemaExists = false;

    WorkerTask *workerTask = NULL;
    char       *taskCallString = text_to_cstring(taskCallStringText);
    uint32      taskCallStringLength = strlen(taskCallString);

    /* check that we have a running task tracker on this host */
    bool        taskTrackerRunning = TaskTrackerRunning();

    if (!taskTrackerRunning)
    {
        ereport(ERROR, (errcode(ERRCODE_CANNOT_CONNECT_NOW),
                        errmsg("the task tracker has been disabled or shut down")));
    }

    /* check that we have enough space in our shared hash for this string */
    if (taskCallStringLength >= TASK_CALL_STRING_SIZE)
    {
        ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                        errmsg("task call string exceeds maximum assignable length")));
    }

    /*
     * If the schema does not exist, we create it.  However, the schema does
     * not become visible to other processes until the transaction commits,
     * and we therefore do not release the resource lock in this case.
     * Otherwise, the schema is already visible, and we immediately release
     * the resource lock.
     */
    LockJobResource(jobId, AccessExclusiveLock);
    schemaExists = JobSchemaExists(jobSchemaName);
    if (!schemaExists)
    {
        /* lock gets automatically released upon return from this function */
        CreateJobSchema(jobSchemaName);
    }
    else
    {
        UnlockJobResource(jobId, AccessExclusiveLock);
    }

    LWLockAcquire(&WorkerTasksSharedState->taskHashLock, LW_EXCLUSIVE);

    /* check if we already have the task in our shared hash */
    workerTask = WorkerTasksHashFind(jobId, taskId);
    if (workerTask == NULL)
    {
        CreateTask(jobId, taskId, taskCallString);
    }
    else
    {
        UpdateTask(workerTask, taskCallString);
    }

    LWLockRelease(&WorkerTasksSharedState->taskHashLock);

    PG_RETURN_VOID();
}
/*
 * shmem_startup hook: allocate or attach to shared memory,
 * then load any pre-existing statistics from file.
 */
static void
pgss_shmem_startup(void)
{
    bool        found;
    HASHCTL     info;
    FILE       *file;
    uint32      header;
    int32       num;
    int32       i;
    int         query_size;
    int         buffer_size;
    char       *buffer = NULL;

    if (prev_shmem_startup_hook)
        prev_shmem_startup_hook();

    /* reset in case this is a restart within the postmaster */
    pgss = NULL;
    pgss_hash = NULL;

    /*
     * Create or attach to the shared memory state, including hash table
     */
    LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);

    pgss = ShmemInitStruct("pg_stat_statements",
                           sizeof(pgssSharedState),
                           &found);

    if (!found)
    {
        /* First time through ... */
        pgss->lock = LWLockAssign();
        pgss->query_size = pgstat_track_activity_query_size;
    }

    /* Be sure everyone agrees on the hash table entry size */
    query_size = pgss->query_size;

    memset(&info, 0, sizeof(info));
    info.keysize = sizeof(pgssHashKey);
    info.entrysize = offsetof(pgssEntry, query) + query_size;
    info.hash = pgss_hash_fn;
    info.match = pgss_match_fn;
    pgss_hash = ShmemInitHash("pg_stat_statements hash",
                              pgss_max, pgss_max,
                              &info,
                              HASH_ELEM | HASH_FUNCTION | HASH_COMPARE);

    LWLockRelease(AddinShmemInitLock);

    /*
     * If we're in the postmaster (or a standalone backend...), set up a
     * shmem exit hook to dump the statistics to disk.
     */
    if (!IsUnderPostmaster)
        on_shmem_exit(pgss_shmem_shutdown, (Datum) 0);

    /*
     * Attempt to load old statistics from the dump file, if this is the
     * first time through and we weren't told not to.
     */
    if (found || !pgss_save)
        return;

    /*
     * Note: we don't bother with locks here, because there should be no
     * other processes running when this code is reached.
     */
    file = AllocateFile(PGSS_DUMP_FILE, PG_BINARY_R);
    if (file == NULL)
    {
        if (errno == ENOENT)
            return;             /* ignore not-found error */
        goto error;
    }

    buffer_size = query_size;
    buffer = (char *) palloc(buffer_size);

    if (fread(&header, sizeof(uint32), 1, file) != 1 ||
        header != PGSS_FILE_HEADER ||
        fread(&num, sizeof(int32), 1, file) != 1)
        goto error;

    for (i = 0; i < num; i++)
    {
        pgssEntry   temp;
        pgssEntry  *entry;

        if (fread(&temp, offsetof(pgssEntry, mutex), 1, file) != 1)
            goto error;

        /* Encoding is the only field we can easily sanity-check */
        if (!PG_VALID_BE_ENCODING(temp.key.encoding))
            goto error;

        /* Previous incarnation might have had a larger query_size */
        if (temp.key.query_len >= buffer_size)
        {
            buffer = (char *) repalloc(buffer, temp.key.query_len + 1);
            buffer_size = temp.key.query_len + 1;
        }

        if (fread(buffer, 1, temp.key.query_len, file) != temp.key.query_len)
            goto error;
        buffer[temp.key.query_len] = '\0';

        /* Clip to available length if needed */
        if (temp.key.query_len >= query_size)
            temp.key.query_len = pg_encoding_mbcliplen(temp.key.encoding,
                                                       buffer,
                                                       temp.key.query_len,
                                                       query_size - 1);
        temp.key.query_ptr = buffer;

        /* make the hashtable entry (discards old entries if too many) */
        entry = entry_alloc(&temp.key);

        /* copy in the actual stats */
        entry->counters = temp.counters;
    }

    pfree(buffer);
    FreeFile(file);
    return;

error:
    ereport(LOG,
            (errcode_for_file_access(),
             errmsg("could not read pg_stat_statement file \"%s\": %m",
                    PGSS_DUMP_FILE)));
    if (buffer)
        pfree(buffer);
    if (file)
        FreeFile(file);
    /* If possible, throw away the bogus file; ignore any error */
    unlink(PGSS_DUMP_FILE);
}
/*
 * Retrieve statement statistics.
 */
Datum
pg_stat_statements(PG_FUNCTION_ARGS)
{
    ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
    TupleDesc   tupdesc;
    Tuplestorestate *tupstore;
    MemoryContext per_query_ctx;
    MemoryContext oldcontext;
    Oid         userid = GetUserId();
    bool        is_superuser = superuser();
    HASH_SEQ_STATUS hash_seq;
    pgssEntry  *entry;

    if (!pgss || !pgss_hash)
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                 errmsg("pg_stat_statements must be loaded via shared_preload_libraries")));

    /* check to see if caller supports us returning a tuplestore */
    if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("set-valued function called in context that cannot accept a set")));
    if (!(rsinfo->allowedModes & SFRM_Materialize))
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("materialize mode required, but it is not "
                        "allowed in this context")));

    /* Build a tuple descriptor for our result type */
    if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
        elog(ERROR, "return type must be a row type");

    per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
    oldcontext = MemoryContextSwitchTo(per_query_ctx);

    tupstore = tuplestore_begin_heap(true, false, work_mem);
    rsinfo->returnMode = SFRM_Materialize;
    rsinfo->setResult = tupstore;
    rsinfo->setDesc = tupdesc;

    MemoryContextSwitchTo(oldcontext);

    LWLockAcquire(pgss->lock, LW_SHARED);

    hash_seq_init(&hash_seq, pgss_hash);
    while ((entry = hash_seq_search(&hash_seq)) != NULL)
    {
        Datum       values[PG_STAT_STATEMENTS_COLS];
        bool        nulls[PG_STAT_STATEMENTS_COLS];
        int         i = 0;
        Counters    tmp;

        memset(values, 0, sizeof(values));
        memset(nulls, 0, sizeof(nulls));

        values[i++] = ObjectIdGetDatum(entry->key.userid);
        values[i++] = ObjectIdGetDatum(entry->key.dbid);

        if (is_superuser || entry->key.userid == userid)
        {
            char       *qstr;

            qstr = (char *)
                pg_do_encoding_conversion((unsigned char *) entry->query,
                                          entry->key.query_len,
                                          entry->key.encoding,
                                          GetDatabaseEncoding());
            values[i++] = CStringGetTextDatum(qstr);
            if (qstr != entry->query)
                pfree(qstr);
        }
        else
            values[i++] = CStringGetTextDatum("<insufficient privilege>");

        /* copy counters to a local variable to keep locking time short */
        {
            volatile pgssEntry *e = (volatile pgssEntry *) entry;

            SpinLockAcquire(&e->mutex);
            tmp = e->counters;
            SpinLockRelease(&e->mutex);
        }

        values[i++] = Int64GetDatumFast(tmp.calls);
        values[i++] = Float8GetDatumFast(tmp.total_time);
        values[i++] = Int64GetDatumFast(tmp.rows);
        values[i++] = Int64GetDatumFast(tmp.shared_blks_hit);
        values[i++] = Int64GetDatumFast(tmp.shared_blks_read);
        values[i++] = Int64GetDatumFast(tmp.shared_blks_written);
        values[i++] = Int64GetDatumFast(tmp.local_blks_hit);
        values[i++] = Int64GetDatumFast(tmp.local_blks_read);
        values[i++] = Int64GetDatumFast(tmp.local_blks_written);
        values[i++] = Int64GetDatumFast(tmp.temp_blks_read);
        values[i++] = Int64GetDatumFast(tmp.temp_blks_written);

        Assert(i == PG_STAT_STATEMENTS_COLS);

        tuplestore_putvalues(tupstore, tupdesc, values, nulls);
    }

    LWLockRelease(pgss->lock);

    /* clean up and return the tuplestore */
    tuplestore_donestoring(tupstore);

    return (Datum) 0;
}
/*
 * Each database using a table space is isolated into its own name space
 * by a subdirectory named for the database OID.  On first creation of an
 * object in the tablespace, create the subdirectory.  If the subdirectory
 * already exists, fall through quietly.
 *
 * isRedo indicates that we are creating an object during WAL replay.
 * In this case we will cope with the possibility of the tablespace
 * directory not being there either --- this could happen if we are
 * replaying an operation on a table in a subsequently-dropped tablespace.
 * We handle this by making a directory in the place where the tablespace
 * symlink would normally be.  This isn't an exact replay of course, but
 * it's the best we can do given the available information.
 *
 * If tablespaces are not supported, we still need it in case we have to
 * re-create a database subdirectory (of $PGDATA/base) during WAL replay.
 */
void
TablespaceCreateDbspace(Oid spcNode, Oid dbNode, bool isRedo)
{
    struct stat st;
    char       *dir;

    /*
     * The global tablespace doesn't have per-database subdirectories, so
     * nothing to do for it.
     */
    if (spcNode == GLOBALTABLESPACE_OID)
        return;

    Assert(OidIsValid(spcNode));
    Assert(OidIsValid(dbNode));

    dir = GetDatabasePath(dbNode, spcNode);

    if (stat(dir, &st) < 0)
    {
        /* Directory does not exist? */
        if (errno == ENOENT)
        {
            /*
             * Acquire TablespaceCreateLock to ensure that no DROP TABLESPACE
             * or TablespaceCreateDbspace is running concurrently.
             */
            LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE);

            /*
             * Recheck to see if someone created the directory while we were
             * waiting for lock.
             */
            if (stat(dir, &st) == 0 && S_ISDIR(st.st_mode))
            {
                /* Directory was created */
            }
            else
            {
                /* Directory creation failed? */
                if (mkdir(dir, S_IRWXU) < 0)
                {
                    char       *parentdir;

                    /* Failure other than not exists or not in WAL replay? */
                    if (errno != ENOENT || !isRedo)
                        ereport(ERROR,
                                (errcode_for_file_access(),
                                 errmsg("could not create directory \"%s\": %m",
                                        dir)));

                    /*
                     * Parent directories are missing during WAL replay, so
                     * continue by creating simple parent directories rather
                     * than a symlink.
                     */

                    /* create two parents up if not exist */
                    parentdir = pstrdup(dir);
                    get_parent_directory(parentdir);
                    get_parent_directory(parentdir);
                    /* Can't create parent and it doesn't already exist? */
                    if (mkdir(parentdir, S_IRWXU) < 0 && errno != EEXIST)
                        ereport(ERROR,
                                (errcode_for_file_access(),
                                 errmsg("could not create directory \"%s\": %m",
                                        parentdir)));
                    pfree(parentdir);

                    /* create one parent up if not exist */
                    parentdir = pstrdup(dir);
                    get_parent_directory(parentdir);
                    /* Can't create parent and it doesn't already exist? */
                    if (mkdir(parentdir, S_IRWXU) < 0 && errno != EEXIST)
                        ereport(ERROR,
                                (errcode_for_file_access(),
                                 errmsg("could not create directory \"%s\": %m",
                                        parentdir)));
                    pfree(parentdir);

                    /* Create database directory */
                    if (mkdir(dir, S_IRWXU) < 0)
                        ereport(ERROR,
                                (errcode_for_file_access(),
                                 errmsg("could not create directory \"%s\": %m",
                                        dir)));
                }
            }

            LWLockRelease(TablespaceCreateLock);
        }
        else
        {
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not stat directory \"%s\": %m", dir)));
        }
    }
    else
    {
        /* Is it not a directory? */
        if (!S_ISDIR(st.st_mode))
            ereport(ERROR,
                    (errcode(ERRCODE_WRONG_OBJECT_TYPE),
                     errmsg("\"%s\" exists but is not a directory",
                            dir)));
    }

    pfree(dir);
}
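
/*
 * Worked path example for the replay branch above (hedged; the exact
 * version-directory name varies by release).  For database 16384 in
 * tablespace 16385, GetDatabasePath() yields something like
 *
 *      pg_tblspc/16385/PG_9.1_201105231/16384
 *
 * so the "two parents up" mkdir creates pg_tblspc/16385 (a plain directory
 * standing in for the symlink), the "one parent up" mkdir creates the
 * version directory, and the final mkdir creates the per-database
 * directory itself.
 */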
void
smgrDoAppendOnlyResyncEofs(bool forCommit)
{
    HASH_SEQ_STATUS iterateStatus;
    AppendOnlyMirrorResyncEofs *entry;
    AppendOnlyMirrorResyncEofs *entryExample = NULL;
    int         appendOnlyMirrorResyncEofsCount;

    if (AppendOnlyMirrorResyncEofsTable == NULL)
        return;

    if (Debug_persistent_print || Debug_persistent_appendonly_commit_count_print)
        elog(Persistent_DebugPrintLevel(),
             "Storage Manager: Enter Append-Only mirror resync eofs list entries (Append-Only commit work count %d)",
             FileRepPrimary_GetAppendOnlyCommitWorkCount());

    hash_seq_init(&iterateStatus, AppendOnlyMirrorResyncEofsTable);

    appendOnlyMirrorResyncEofsCount = 0;
    while ((entry = hash_seq_search(&iterateStatus)) != NULL)
    {
        if (entryExample == NULL)
        {
            entryExample = entry;
        }

        if (forCommit)
        {
            PersistentFileSysObj_UpdateAppendOnlyMirrorResyncEofs(&entry->key.relFileNode,
                                                                  entry->key.segmentFileNum,
                                                                  &entry->persistentTid,
                                                                  entry->persistentSerialNum,
                                                                  entry->mirrorCatchupRequired,
                                                                  entry->mirrorNewEof,
                                                                  /* recovery */ false,
                                                                  /* flushToXLog */ false);
        }
        else
        {
            /*
             * Abort case.
             */
            if (entry->didIncrementCommitCount)
            {
                int         systemAppendOnlyCommitWorkCount;

                LWLockAcquire(FileRepAppendOnlyCommitCountLock, LW_EXCLUSIVE);

                systemAppendOnlyCommitWorkCount =
                    FileRepPrimary_FinishedAppendOnlyCommitWork(1);

                if (entry->isDistributedTransaction)
                {
                    PrepareDecrAppendOnlyCommitWork(entry->gid);
                }

                if (Debug_persistent_print || Debug_persistent_appendonly_commit_count_print)
                    elog(Persistent_DebugPrintLevel(),
                         "Storage Manager: Append-Only Mirror Resync EOFs decrementing commit work for aborted transaction "
                         "(system count %d). "
                         "Relation %u/%u/%u, segment file #%d (persistent serial num " INT64_FORMAT ", TID %s) ",
                         systemAppendOnlyCommitWorkCount,
                         entry->key.relFileNode.spcNode,
                         entry->key.relFileNode.dbNode,
                         entry->key.relFileNode.relNode,
                         entry->key.segmentFileNum,
                         entry->persistentSerialNum,
                         ItemPointerToString(&entry->persistentTid));

                pendingAppendOnlyMirrorResyncIntentCount--;

                LWLockRelease(FileRepAppendOnlyCommitCountLock);
            }
        }

        if (Debug_persistent_print || Debug_persistent_appendonly_commit_count_print)
            elog(Persistent_DebugPrintLevel(),
                 "Storage Manager: Append-Only mirror resync eofs list entry #%d: %u/%u/%u, segment file #%d, relation name '%s' "
                 "(forCommit %s, persistent TID %s, persistent serial number " INT64_FORMAT ", mirror catchup required %s, mirror new EOF " INT64_FORMAT ")",
                 appendOnlyMirrorResyncEofsCount,
                 entry->key.relFileNode.spcNode,
                 entry->key.relFileNode.dbNode,
                 entry->key.relFileNode.relNode,
                 entry->key.segmentFileNum,
                 (entry->relationName == NULL ? "<null>" : entry->relationName),
                 (forCommit ? "true" : "false"),
                 ItemPointerToString(&entry->persistentTid),
                 entry->persistentSerialNum,
                 (entry->mirrorCatchupRequired ? "true" : "false"),
                 entry->mirrorNewEof);

        appendOnlyMirrorResyncEofsCount++;
    }

    /*
     * If we collected Append-Only mirror resync EOFs and bumped the intent
     * count, we need to decrement the counts as part of our end transaction
     * work here.
     */
    if (pendingAppendOnlyMirrorResyncIntentCount > 0)
    {
        MIRRORED_LOCK_DECLARE;

        int         oldSystemAppendOnlyCommitWorkCount;
        int         newSystemAppendOnlyCommitWorkCount;
        int         resultSystemAppendOnlyCommitWorkCount;

        if (appendOnlyMirrorResyncEofsCount != pendingAppendOnlyMirrorResyncIntentCount)
            elog(ERROR, "Pending Append-Only Mirror Resync EOFs intent count mismatch (pending %d, table count %d)",
                 pendingAppendOnlyMirrorResyncIntentCount,
                 appendOnlyMirrorResyncEofsCount);

        if (entryExample == NULL)
            elog(ERROR, "Not expecting an empty Append-Only Mirror Resync hash table when the local intent count is non-zero (%d)",
                 pendingAppendOnlyMirrorResyncIntentCount);

        /* NOTE: When we use the MirroredLock for the whole routine, this can go. */
        MIRRORED_LOCK;

        LWLockAcquire(FileRepAppendOnlyCommitCountLock, LW_EXCLUSIVE);

        oldSystemAppendOnlyCommitWorkCount = FileRepPrimary_GetAppendOnlyCommitWorkCount();

        newSystemAppendOnlyCommitWorkCount =
            oldSystemAppendOnlyCommitWorkCount - pendingAppendOnlyMirrorResyncIntentCount;

        if (newSystemAppendOnlyCommitWorkCount < 0)
            elog(ERROR,
                 "Append-Only Mirror Resync EOFs intent count would go negative "
                 "(system count %d, entry count %d). "
                 "Example relation %u/%u/%u, segment file #%d (persistent serial num " INT64_FORMAT ", TID %s)",
                 oldSystemAppendOnlyCommitWorkCount,
                 pendingAppendOnlyMirrorResyncIntentCount,
                 entryExample->key.relFileNode.spcNode,
                 entryExample->key.relFileNode.dbNode,
                 entryExample->key.relFileNode.relNode,
                 entryExample->key.segmentFileNum,
                 entryExample->persistentSerialNum,
                 ItemPointerToString(&entryExample->persistentTid));

        resultSystemAppendOnlyCommitWorkCount =
            FileRepPrimary_FinishedAppendOnlyCommitWork(pendingAppendOnlyMirrorResyncIntentCount);

        /*
         * Should match since we are under FileRepAppendOnlyCommitCountLock
         * EXCLUSIVE.
         */
        Assert(newSystemAppendOnlyCommitWorkCount == resultSystemAppendOnlyCommitWorkCount);

        pendingAppendOnlyMirrorResyncIntentCount = 0;

        if (Debug_persistent_print || Debug_persistent_appendonly_commit_count_print)
            elog(Persistent_DebugPrintLevel(),
                 "Storage Manager: Append-Only Mirror Resync EOFs intent count finishing %s work with system count %d remaining "
                 "(enter system count %d, entry count %d, result system count %d). "
                 "Example relation %u/%u/%u, segment file #%d (persistent serial num " INT64_FORMAT ", TID %s)",
                 (forCommit ? "commit" : "abort"),
                 newSystemAppendOnlyCommitWorkCount,
                 oldSystemAppendOnlyCommitWorkCount,
                 pendingAppendOnlyMirrorResyncIntentCount,
                 resultSystemAppendOnlyCommitWorkCount,
                 entryExample->key.relFileNode.spcNode,
                 entryExample->key.relFileNode.dbNode,
                 entryExample->key.relFileNode.relNode,
                 entryExample->key.segmentFileNum,
                 entryExample->persistentSerialNum,
                 ItemPointerToString(&entryExample->persistentTid));

        LWLockRelease(FileRepAppendOnlyCommitCountLock);

        MIRRORED_UNLOCK;
    }
}
/*
 * Wait for synchronous replication, if requested by user.
 *
 * Initially backends start in state SYNC_REP_NOT_WAITING and then
 * change that state to SYNC_REP_WAITING before adding ourselves
 * to the wait queue.  During SyncRepWakeQueue() a WALSender changes
 * the state to SYNC_REP_WAIT_COMPLETE once replication is confirmed.
 * This backend then resets its state to SYNC_REP_NOT_WAITING.
 */
void
SyncRepWaitForLSN(XLogRecPtr XactCommitLSN)
{
    char       *new_status = NULL;
    const char *old_status;

    /*
     * Fast exit if user has not requested sync replication, or there are no
     * sync replication standby names defined.  Note that those standbys
     * don't need to be connected.
     */
    if (!SyncRepRequested() || !SyncStandbysDefined())
        return;

    Assert(SHMQueueIsDetached(&(MyProc->syncRepLinks)));
    Assert(WalSndCtl != NULL);

    /* Reset the latch before adding ourselves to the queue. */
    ResetLatch(&MyProc->waitLatch);

    /*
     * Set our waitLSN so WALSender will know when to wake us, and add
     * ourselves to the queue.
     */
    LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
    Assert(MyProc->syncRepState == SYNC_REP_NOT_WAITING);
    if (!WalSndCtl->sync_standbys_defined)
    {
        /*
         * We don't wait for sync rep if WalSndCtl->sync_standbys_defined is
         * not set.  See SyncRepUpdateSyncStandbysDefined.
         */
        LWLockRelease(SyncRepLock);
        return;
    }
    MyProc->waitLSN = XactCommitLSN;
    MyProc->syncRepState = SYNC_REP_WAITING;
    SyncRepQueueInsert();
    Assert(SyncRepQueueIsOrderedByLSN());
    LWLockRelease(SyncRepLock);

    /* Alter ps display to show waiting for sync rep. */
    if (update_process_title)
    {
        int         len;

        old_status = get_ps_display(&len);
        new_status = (char *) palloc(len + 32 + 1);
        memcpy(new_status, old_status, len);
        sprintf(new_status + len, " waiting for %X/%X",
                XactCommitLSN.xlogid, XactCommitLSN.xrecoff);
        set_ps_display(new_status, false);
        new_status[len] = '\0'; /* truncate off " waiting ..." */
    }

    /*
     * Wait for specified LSN to be confirmed.
     *
     * Each proc has its own wait latch, so we perform a normal latch
     * check/wait loop here.
     */
    for (;;)
    {
        int         syncRepState;

        /*
         * Wait on latch for up to 60 seconds.  This allows us to check for
         * postmaster death regularly while waiting.  Note that timeout here
         * does not necessarily release from loop.
         */
        WaitLatch(&MyProc->waitLatch, 60000000L);

        /* Must reset the latch before testing state. */
        ResetLatch(&MyProc->waitLatch);

        /*
         * Try checking the state without the lock first.  There's no
         * guarantee that we'll read the most up-to-date value, so if it
         * looks like we're still waiting, recheck while holding the lock.
         * But if it looks like we're done, we must really be done, because
         * once walsender changes the state to SYNC_REP_WAIT_COMPLETE, it
         * will never update it again, so we can't be seeing a stale value
         * in that case.
         */
        syncRepState = MyProc->syncRepState;
        if (syncRepState == SYNC_REP_WAITING)
        {
            LWLockAcquire(SyncRepLock, LW_SHARED);
            syncRepState = MyProc->syncRepState;
            LWLockRelease(SyncRepLock);
        }
        if (syncRepState == SYNC_REP_WAIT_COMPLETE)
            break;

        /*
         * If a wait for synchronous replication is pending, we can neither
         * acknowledge the commit nor raise ERROR or FATAL.  The latter would
         * lead the client to believe that the transaction aborted, which is
         * not true: it's already committed locally.  The former is no good
         * either: the client has requested synchronous replication, and is
         * entitled to assume that an acknowledged commit is also replicated,
         * which may not be true.  So in this case we issue a WARNING (which
         * some clients may be able to interpret) and shut off further
         * output.  We do NOT reset ProcDiePending, so that the process will
         * die after the commit is cleaned up.
         */
        if (ProcDiePending)
        {
            ereport(WARNING,
                    (errcode(ERRCODE_ADMIN_SHUTDOWN),
                     errmsg("canceling the wait for synchronous replication and terminating connection due to administrator command"),
                     errdetail("The transaction has already committed locally, but may not have been replicated to the standby.")));
            whereToSendOutput = DestNone;
            SyncRepCancelWait();
            break;
        }

        /*
         * It's unclear what to do if a query cancel interrupt arrives.  We
         * can't actually abort at this point, but ignoring the interrupt
         * altogether is not helpful, so we just terminate the wait with a
         * suitable warning.
         */
        if (QueryCancelPending)
        {
            QueryCancelPending = false;
            ereport(WARNING,
                    (errmsg("canceling wait for synchronous replication due to user request"),
                     errdetail("The transaction has already committed locally, but may not have been replicated to the standby.")));
            SyncRepCancelWait();
            break;
        }

        /*
         * If the postmaster dies, we'll probably never get an
         * acknowledgement, because all the wal sender processes will exit.
         * So just bail out.
         */
        if (!PostmasterIsAlive(true))
        {
            ProcDiePending = true;
            whereToSendOutput = DestNone;
            SyncRepCancelWait();
            break;
        }
    }

    /*
     * WalSender has checked our LSN and has removed us from queue.  Clean up
     * state and leave.  It's OK to reset these shared memory fields without
     * holding SyncRepLock, because any walsenders will ignore us anyway when
     * we're not on the queue.
     */
    Assert(SHMQueueIsDetached(&(MyProc->syncRepLinks)));
    MyProc->syncRepState = SYNC_REP_NOT_WAITING;
    MyProc->waitLSN.xlogid = 0;
    MyProc->waitLSN.xrecoff = 0;

    if (new_status)
    {
        /* Reset ps display */
        set_ps_display(new_status, false);
        pfree(new_status);
    }
}
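
/*
 * Caller-side note (hedged sketch, not from this file): the commit path
 * invokes this function only after the commit record is durably flushed to
 * local WAL, conceptually
 *
 *      XLogFlush(recptr);            -- commit record durable locally
 *      ... clog update etc. ...
 *      SyncRepWaitForLSN(recptr);    -- then wait for standby confirmation
 *
 * which is why cancelling the wait can only ever yield a transaction that
 * is committed locally but perhaps not replicated, never the reverse.
 */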
Datum
dbms_alert_defered_signal(PG_FUNCTION_ARGS)
{
    TriggerData *trigdata = (TriggerData *) fcinfo->context;
    TupleDesc   tupdesc;
    HeapTuple   rettuple;
    char       *relname;
    text       *name;
    text       *message;
    int         event_col;
    int         message_col;
    Datum       datum;
    bool        isnull;
    int         cycle = 0;
    float8      endtime;
    float8      timeout = 2;

    if (!CALLED_AS_TRIGGER(fcinfo))
        ereport(ERROR,
                (errcode(ERRCODE_TRIGGERED_ACTION_EXCEPTION),
                 errmsg("not called by trigger manager")));

    if (!TRIGGER_FIRED_BY_INSERT(trigdata->tg_event))
        ereport(ERROR,
                (errcode(ERRCODE_TRIGGERED_ACTION_EXCEPTION),
                 errmsg("not called on valid event")));

    if (SPI_connect() < 0)
        ereport(ERROR,
                (errcode(ERRCODE_TRIGGERED_ACTION_EXCEPTION),
                 errmsg("SPI_connect failed")));

    if (strcmp((relname = SPI_getrelname(trigdata->tg_relation)), "ora_alerts") != 0)
        ereport(ERROR,
                (errcode(ERRCODE_TRIGGERED_ACTION_EXCEPTION),
                 errmsg("not called with valid relation")));

    rettuple = trigdata->tg_trigtuple;
    tupdesc = trigdata->tg_relation->rd_att;

    if (SPI_ERROR_NOATTRIBUTE == (event_col = SPI_fnumber(tupdesc, "event")))
        ereport(ERROR,
                (errcode(ERRCODE_TRIGGERED_ACTION_EXCEPTION),
                 errmsg("attribute event not found")));

    if (SPI_ERROR_NOATTRIBUTE == (message_col = SPI_fnumber(tupdesc, "message")))
        ereport(ERROR,
                (errcode(ERRCODE_TRIGGERED_ACTION_EXCEPTION),
                 errmsg("attribute message not found")));

    datum = SPI_getbinval(rettuple, tupdesc, event_col, &isnull);
    if (isnull)
        ereport(ERROR,
                (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
                 errmsg("event name is NULL"),
                 errdetail("Eventname may not be NULL.")));
    name = DatumGetTextP(datum);

    datum = SPI_getbinval(rettuple, tupdesc, message_col, &isnull);
    if (isnull)
        message = NULL;
    else
        message = DatumGetTextP(datum);

    WATCH_PRE(timeout, endtime, cycle);
    if (ora_lock_shmem(SHMEMMSGSZ, MAX_PIPES, MAX_EVENTS, MAX_LOCKS, false))
    {
        ItemPointer tid;
        Oid         argtypes[1] = {TIDOID};
        char        nulls[1] = {' '};
        Datum       values[1];
        void       *plan;

        create_message(name, message);
        LWLockRelease(shmem_lock);

        tid = &rettuple->t_data->t_ctid;

        if (!(plan = SPI_prepare("DELETE FROM ora_alerts WHERE ctid = $1", 1, argtypes)))
            ereport(ERROR,
                    (errcode(ERRCODE_TRIGGERED_ACTION_EXCEPTION),
                     errmsg("SPI_prepare failed")));

        values[0] = ItemPointerGetDatum(tid);

        if (SPI_OK_DELETE != SPI_execute_plan(plan, values, nulls, false, 1))
            ereport(ERROR,
                    (errcode(ERRCODE_TRIGGERED_ACTION_EXCEPTION),
                     errmsg("can't execute sql")));

        SPI_finish();
        return PointerGetDatum(rettuple);
    }
    WATCH_POST(timeout, endtime, cycle);
    LOCK_ERROR();

    PG_RETURN_NULL();
}
/*
 * Update the LSNs on each queue based upon our latest state.  This
 * implements a simple policy of first-valid-standby-releases-waiter.
 *
 * Other policies are possible, which would change what we do here and
 * perhaps also which information we store.
 */
void
SyncRepReleaseWaiters(void)
{
    volatile WalSndCtlData *walsndctl = WalSndCtl;
    volatile WalSnd *syncWalSnd = NULL;
    int         numprocs = 0;
    int         priority = 0;
    int         i;

    /*
     * If this WALSender is serving a standby that is not on the list of
     * potential standbys then we have nothing to do.  If we are still
     * starting up or still running base backup, then leave quickly also.
     */
    if (MyWalSnd->sync_standby_priority == 0 ||
        MyWalSnd->state < WALSNDSTATE_STREAMING)
        return;

    /*
     * We're a potential sync standby.  Release waiters if we are the highest
     * priority standby.  If there are multiple standbys with the same
     * priority then we use the first mentioned standby.  If you change
     * this, also change pg_stat_get_wal_senders().
     */
    LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);

    for (i = 0; i < max_wal_senders; i++)
    {
        /* use volatile pointer to prevent code rearrangement */
        volatile WalSnd *walsnd = &walsndctl->walsnds[i];

        if (walsnd->pid != 0 &&
            walsnd->sync_standby_priority > 0 &&
            (priority == 0 ||
             priority > walsnd->sync_standby_priority))
        {
            priority = walsnd->sync_standby_priority;
            syncWalSnd = walsnd;
        }
    }

    /*
     * We should have found ourselves at least.
     */
    Assert(syncWalSnd);

    /*
     * If we aren't managing the highest priority standby then just leave.
     */
    if (syncWalSnd != MyWalSnd)
    {
        LWLockRelease(SyncRepLock);
        announce_next_takeover = true;
        return;
    }

    if (XLByteLT(walsndctl->lsn, MyWalSnd->flush))
    {
        /*
         * Set the lsn first so that when we wake backends they will release
         * up to this location.
         */
        walsndctl->lsn = MyWalSnd->flush;
        numprocs = SyncRepWakeQueue(false);
    }

    LWLockRelease(SyncRepLock);

    elog(DEBUG3, "released %d procs up to %X/%X",
         numprocs,
         MyWalSnd->flush.xlogid,
         MyWalSnd->flush.xrecoff);

    /*
     * If we are managing the highest priority standby, though we weren't
     * prior to this, then announce we are now the sync standby.
     */
    if (announce_next_takeover)
    {
        announce_next_takeover = false;
        ereport(LOG,
                (errmsg("standby \"%s\" is now the synchronous standby with priority %u",
                        application_name, MyWalSnd->sync_standby_priority)));
    }
}
/*
 * ShmemInitStruct -- Create/attach to a structure in shared memory.
 *
 * This is called during initialization to find or allocate
 * a data structure in shared memory.  If no other process
 * has created the structure, this routine allocates space
 * for it.  If it exists already, a pointer to the existing
 * structure is returned.
 *
 * Returns: pointer to the object.  *foundPtr is set TRUE if the object was
 * already in the shmem index (hence, already initialized).
 *
 * Note: before Postgres 9.0, this function returned NULL for some failure
 * cases.  Now, it always throws error instead, so callers need not check
 * for NULL.
 */
void *
ShmemInitStruct(const char *name, Size size, bool *foundPtr)
{
    ShmemIndexEnt *result;
    void       *structPtr;

    LWLockAcquire(ShmemIndexLock, LW_EXCLUSIVE);

    if (!ShmemIndex)
    {
        PGShmemHeader *shmemseghdr = ShmemSegHdr;

        /* Must be trying to create/attach to ShmemIndex itself */
        Assert(strcmp(name, "ShmemIndex") == 0);

        if (IsUnderPostmaster)
        {
            /* Must be initializing a (non-standalone) backend */
            Assert(shmemseghdr->index != NULL);
            structPtr = shmemseghdr->index;
            *foundPtr = TRUE;
        }
        else
        {
            /*
             * If the shmem index doesn't exist, we are bootstrapping: we
             * must be trying to init the shmem index itself.
             *
             * Notice that the ShmemIndexLock is released before the shmem
             * index has been initialized.  This should be OK because no
             * other process can be accessing shared memory yet.
             */
            Assert(shmemseghdr->index == NULL);
            structPtr = ShmemAlloc(size);
            shmemseghdr->index = structPtr;
            *foundPtr = FALSE;
        }
        LWLockRelease(ShmemIndexLock);
        return structPtr;
    }

    /* look it up in the shmem index */
    result = (ShmemIndexEnt *)
        hash_search(ShmemIndex, name, HASH_ENTER_NULL, foundPtr);

    if (!result)
    {
        LWLockRelease(ShmemIndexLock);
        ereport(ERROR,
                (errcode(ERRCODE_OUT_OF_MEMORY),
                 errmsg("could not create ShmemIndex entry for data structure \"%s\"",
                        name)));
    }

    if (*foundPtr)
    {
        /*
         * Structure is in the shmem index so someone else has allocated it
         * already.  The size better be the same as the size we are trying
         * to initialize to, or there is a name conflict (or worse).
         */
        if (result->size != size)
        {
            LWLockRelease(ShmemIndexLock);
            ereport(ERROR,
                    (errmsg("ShmemIndex entry size is wrong for data structure"
                            " \"%s\": expected %zu, actual %zu",
                            name, size, result->size)));
        }
        structPtr = result->location;
    }
    else
    {
        /* It isn't in the table yet. allocate and initialize it */
        structPtr = ShmemAllocNoError(size);
        if (structPtr == NULL)
        {
            /* out of memory; remove the failed ShmemIndex entry */
            hash_search(ShmemIndex, name, HASH_REMOVE, NULL);
            LWLockRelease(ShmemIndexLock);
            ereport(ERROR,
                    (errcode(ERRCODE_OUT_OF_MEMORY),
                     errmsg("not enough shared memory for data structure"
                            " \"%s\" (%zu bytes requested)",
                            name, size)));
        }
        result->size = size;
        result->location = structPtr;
    }

    LWLockRelease(ShmemIndexLock);

    Assert(ShmemAddrIsValid(structPtr));

    Assert(structPtr == (void *) CACHELINEALIGN(structPtr));

    return structPtr;
}
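
/*
 * Minimal usage sketch, distilled from the pgss_shmem_startup example
 * earlier in this section.  "MyExtState", my_state, and my_shmem_startup
 * are hypothetical names; the lock/init/release pattern is the standard
 * one for extensions attaching to shared memory.
 */
typedef struct MyExtState
{
    int         counter;
} MyExtState;

static MyExtState *my_state = NULL;

static void
my_shmem_startup(void)
{
    bool        found;

    LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
    my_state = (MyExtState *)
        ShmemInitStruct("my_ext", sizeof(MyExtState), &found);
    if (!found)
        my_state->counter = 0;  /* first process through initializes */
    LWLockRelease(AddinShmemInitLock);
}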
/*
 * Write a page from a shared buffer, if necessary.
 * Does nothing if the specified slot is not dirty.
 *
 * NOTE: only one write attempt is made here.  Hence, it is possible that
 * the page is still dirty at exit (if someone else re-dirtied it during
 * the write).  However, we *do* attempt a fresh write even if the page
 * is already being written; this is for checkpoints.
 *
 * Control lock must be held at entry, and will be held at exit.
 */
static void
SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
{
    SlruShared  shared = ctl->shared;
    int         pageno = shared->page_number[slotno];
    bool        ok;

    /* If a write is in progress, wait for it to finish */
    while (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
           shared->page_number[slotno] == pageno)
    {
        SimpleLruWaitIO(ctl, slotno);
    }

    /*
     * Do nothing if page is not dirty, or if buffer no longer contains the
     * same page we were called for.
     */
    if (!shared->page_dirty[slotno] ||
        shared->page_status[slotno] != SLRU_PAGE_VALID ||
        shared->page_number[slotno] != pageno)
        return;

    /*
     * Mark the slot write-busy, and clear the dirtybit.  After this point, a
     * transaction status update on this page will mark it dirty again.
     */
    shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS;
    shared->page_dirty[slotno] = false;

    /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
    LWLockAcquire(shared->buffer_locks[slotno], LW_EXCLUSIVE);

    /* Release control lock while doing I/O */
    LWLockRelease(shared->ControlLock);

    /* Do the write */
    ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata);

    /* If we failed, and we're in a flush, better close the files */
    if (!ok && fdata)
    {
        int         i;

        for (i = 0; i < fdata->num_files; i++)
            close(fdata->fd[i]);
    }

    /* Re-acquire control lock and update page state */
    LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);

    Assert(shared->page_number[slotno] == pageno &&
           shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS);

    /* If we failed to write, mark the page dirty again */
    if (!ok)
        shared->page_dirty[slotno] = true;

    shared->page_status[slotno] = SLRU_PAGE_VALID;

    LWLockRelease(shared->buffer_locks[slotno]);

    /* Now it's okay to ereport if we failed */
    if (!ok)
        SlruReportIOError(ctl, pageno, InvalidTransactionId);
}
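
/*
 * Hedged sketch of the public entry point: slru.c exposes this internal
 * routine through a thin wrapper of roughly this shape (as the two-argument
 * SimpleLruWritePage call in commit_ts_redo above suggests).  Passing
 * fdata = NULL means "not part of a flush", so a failed write is reported
 * immediately rather than being batched with other flush errors.
 */
void
SimpleLruWritePage(SlruCtl ctl, int slotno)
{
    SlruInternalWritePage(ctl, slotno, NULL);
}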
/*
 * Returns activity of walsenders, including pids and xlog locations sent to
 * standby servers.
 */
Datum
pg_stat_get_wal_senders(PG_FUNCTION_ARGS)
{
#define PG_STAT_GET_WAL_SENDERS_COLS 8
    ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
    TupleDesc   tupdesc;
    Tuplestorestate *tupstore;
    MemoryContext per_query_ctx;
    MemoryContext oldcontext;
    int        *sync_priority;
    int         priority = 0;
    int         sync_standby = -1;
    int         i;

    /* check to see if caller supports us returning a tuplestore */
    if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("set-valued function called in context that cannot accept a set")));
    if (!(rsinfo->allowedModes & SFRM_Materialize))
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("materialize mode required, but it is not "
                        "allowed in this context")));

    /* Build a tuple descriptor for our result type */
    if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
        elog(ERROR, "return type must be a row type");

    per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
    oldcontext = MemoryContextSwitchTo(per_query_ctx);

    tupstore = tuplestore_begin_heap(true, false, work_mem);
    rsinfo->returnMode = SFRM_Materialize;
    rsinfo->setResult = tupstore;
    rsinfo->setDesc = tupdesc;

    MemoryContextSwitchTo(oldcontext);

    /*
     * Get the priorities of sync standbys all in one go, to minimise lock
     * acquisitions and to allow us to evaluate who is the current sync
     * standby.  This code must match the code in SyncRepReleaseWaiters().
     */
    sync_priority = palloc(sizeof(int) * max_wal_senders);
    LWLockAcquire(SyncRepLock, LW_SHARED);
    for (i = 0; i < max_wal_senders; i++)
    {
        /* use volatile pointer to prevent code rearrangement */
        volatile WalSnd *walsnd = &WalSndCtl->walsnds[i];

        if (walsnd->pid != 0)
        {
            sync_priority[i] = walsnd->sync_standby_priority;

            if (walsnd->state == WALSNDSTATE_STREAMING &&
                walsnd->sync_standby_priority > 0 &&
                (priority == 0 ||
                 priority > walsnd->sync_standby_priority))
            {
                priority = walsnd->sync_standby_priority;
                sync_standby = i;
            }
        }
    }
    LWLockRelease(SyncRepLock);

    for (i = 0; i < max_wal_senders; i++)
    {
        /* use volatile pointer to prevent code rearrangement */
        volatile WalSnd *walsnd = &WalSndCtl->walsnds[i];
        char        location[MAXFNAMELEN];
        XLogRecPtr  sentPtr;
        XLogRecPtr  write;
        XLogRecPtr  flush;
        XLogRecPtr  apply;
        WalSndState state;
        Datum       values[PG_STAT_GET_WAL_SENDERS_COLS];
        bool        nulls[PG_STAT_GET_WAL_SENDERS_COLS];

        if (walsnd->pid == 0)
            continue;

        SpinLockAcquire(&walsnd->mutex);
        sentPtr = walsnd->sentPtr;
        state = walsnd->state;
        write = walsnd->write;
        flush = walsnd->flush;
        apply = walsnd->apply;
        SpinLockRelease(&walsnd->mutex);

        memset(nulls, 0, sizeof(nulls));
        values[0] = Int32GetDatum(walsnd->pid);

        if (!superuser())
        {
            /*
             * Only superusers can see details.  Other users only get the
             * pid value to know it's a walsender, but no details.
             */
            MemSet(&nulls[1], true, PG_STAT_GET_WAL_SENDERS_COLS - 1);
        }
        else
        {
            values[1] = CStringGetTextDatum(WalSndGetStateString(state));

            snprintf(location, sizeof(location), "%X/%X",
                     sentPtr.xlogid, sentPtr.xrecoff);
            values[2] = CStringGetTextDatum(location);

            if (write.xlogid == 0 && write.xrecoff == 0)
                nulls[3] = true;
            snprintf(location, sizeof(location), "%X/%X",
                     write.xlogid, write.xrecoff);
            values[3] = CStringGetTextDatum(location);

            if (flush.xlogid == 0 && flush.xrecoff == 0)
                nulls[4] = true;
            snprintf(location, sizeof(location), "%X/%X",
                     flush.xlogid, flush.xrecoff);
            values[4] = CStringGetTextDatum(location);

            if (apply.xlogid == 0 && apply.xrecoff == 0)
                nulls[5] = true;
            snprintf(location, sizeof(location), "%X/%X",
                     apply.xlogid, apply.xrecoff);
            values[5] = CStringGetTextDatum(location);

            values[6] = Int32GetDatum(sync_priority[i]);

            /*
             * More easily understood version of standby state.  This is
             * purely informational, not different from priority.
             */
            if (sync_priority[i] == 0)
                values[7] = CStringGetTextDatum("async");
            else if (i == sync_standby)
                values[7] = CStringGetTextDatum("sync");
            else
                values[7] = CStringGetTextDatum("potential");
        }

        tuplestore_putvalues(tupstore, tupdesc, values, nulls);
    }
    pfree(sync_priority);

    /* clean up and return the tuplestore */
    tuplestore_donestoring(tupstore);

    return (Datum) 0;
}
int
smgrGetAppendOnlyMirrorResyncEofs(EndXactRecKind endXactRecKind,
                                  PersistentEndXactAppendOnlyMirrorResyncEofs **ptr)
{
    int         nestLevel = GetCurrentTransactionNestLevel();
    int         nentries;
    PersistentEndXactAppendOnlyMirrorResyncEofs *rptr;
    HASH_SEQ_STATUS iterateStatus;
    AppendOnlyMirrorResyncEofs *entry;
    int         entryIndex;

    if (endXactRecKind == EndXactRecKind_Abort)
    {
        /*
         * No Append-Only Mirror Resync EOF information needed on abort.
         */
        *ptr = NULL;
        return 0;
    }

    nentries = 0;

    if (AppendOnlyMirrorResyncEofsTable != NULL)
    {
        hash_seq_init(&iterateStatus, AppendOnlyMirrorResyncEofsTable);

        while ((entry = hash_seq_search(&iterateStatus)) != NULL)
        {
            if (entry->key.nestLevel >= nestLevel)
                nentries++;
        }
    }
    if (nentries == 0)
    {
        *ptr = NULL;
        return 0;
    }

    if (Debug_persistent_print || Debug_persistent_appendonly_commit_count_print)
        elog(Persistent_DebugPrintLevel(),
             "Storage Manager: Get Append-Only mirror resync eofs list entries (current transaction nest level %d, Append-Only commit work system count %d)",
             nestLevel,
             FileRepPrimary_GetAppendOnlyCommitWorkCount());

    rptr = (PersistentEndXactAppendOnlyMirrorResyncEofs *)
        palloc(nentries * sizeof(PersistentEndXactAppendOnlyMirrorResyncEofs));
    *ptr = rptr;
    entryIndex = 0;

    hash_seq_init(&iterateStatus, AppendOnlyMirrorResyncEofsTable);

    while ((entry = hash_seq_search(&iterateStatus)) != NULL)
    {
        MIRRORED_LOCK_DECLARE;

        bool        returned;
        int         resultSystemAppendOnlyCommitCount;

        returned = false;

        if (entry->key.nestLevel >= nestLevel)
        {
            MIRRORED_LOCK;

            MirroredAppendOnly_EndXactCatchup(entryIndex,
                                              &entry->key.relFileNode,
                                              entry->key.segmentFileNum,
                                              entry->key.nestLevel,
                                              entry->relationName,
                                              &entry->persistentTid,
                                              entry->persistentSerialNum,
                                              &mirroredLockLocalVars,
                                              entry->mirrorCatchupRequired,
                                              entry->mirrorDataLossTrackingState,
                                              entry->mirrorDataLossTrackingSessionNum,
                                              entry->mirrorNewEof);

            /*
             * See if the mirror situation for this Append-Only segment file
             * has changed since we flushed it to disk.
             */
            rptr->relFileNode = entry->key.relFileNode;
            rptr->segmentFileNum = entry->key.segmentFileNum;

            rptr->persistentTid = entry->persistentTid;
            rptr->persistentSerialNum = entry->persistentSerialNum;

            if (entry->mirrorCatchupRequired)
            {
                rptr->mirrorLossEof = INT64CONST(-1);
            }
            else
            {
                rptr->mirrorLossEof = entry->mirrorNewEof;
            }
            rptr->mirrorNewEof = entry->mirrorNewEof;

            rptr++;
            returned = true;

            START_CRIT_SECTION();

            LWLockAcquire(FileRepAppendOnlyCommitCountLock, LW_EXCLUSIVE);

            resultSystemAppendOnlyCommitCount =
                FileRepPrimary_IntentAppendOnlyCommitWork();

            /* Set this inside the Critical Section. */
            entry->didIncrementCommitCount = true;

            if (endXactRecKind == EndXactRecKind_Prepare)
            {
                char        gid[TMGIDSIZE];

                if (!getDistributedTransactionIdentifier(gid))
                    elog(ERROR, "Unable to obtain gid during prepare");

                PrepareIntentAppendOnlyCommitWork(gid);

                entry->isDistributedTransaction = true;
                memcpy(entry->gid, gid, TMGIDSIZE);
            }

            pendingAppendOnlyMirrorResyncIntentCount++;
        }
        else
        {
            MIRRORED_LOCK;

            START_CRIT_SECTION();

            LWLockAcquire(FileRepAppendOnlyCommitCountLock, LW_EXCLUSIVE);

            resultSystemAppendOnlyCommitCount =
                FileRepPrimary_GetAppendOnlyCommitWorkCount();
        }

        if (Debug_persistent_print || Debug_persistent_appendonly_commit_count_print)
        {
            if (entry->relationName == NULL)
                elog(Persistent_DebugPrintLevel(),
                     "Storage Manager: Get Append-Only mirror resync eofs list entry #%d: %u/%u/%u, segment file #%d "
                     "(returned %s, result system Append-Only commit count %d, transaction nest level %d, persistent TID %s, persistent serial number " INT64_FORMAT ", mirror catchup required %s, mirror new EOF " INT64_FORMAT ")",
                     entryIndex,
                     entry->key.relFileNode.spcNode,
                     entry->key.relFileNode.dbNode,
                     entry->key.relFileNode.relNode,
                     entry->key.segmentFileNum,
                     (returned ? "true" : "false"),
                     resultSystemAppendOnlyCommitCount,
                     entry->key.nestLevel,
                     ItemPointerToString(&entry->persistentTid),
                     entry->persistentSerialNum,
                     (entry->mirrorCatchupRequired ? "true" : "false"),
                     entry->mirrorNewEof);
            else
                elog(Persistent_DebugPrintLevel(),
                     "Storage Manager: Get Append-Only mirror resync eofs list entry #%d: %u/%u/%u, segment file #%d, relation name '%s' "
                     "(returned %s, result system Append-Only commit count %d, transaction nest level %d, persistent TID %s, persistent serial number " INT64_FORMAT ", mirror catchup required %s, mirror new EOF " INT64_FORMAT ")",
                     entryIndex,
                     entry->key.relFileNode.spcNode,
                     entry->key.relFileNode.dbNode,
                     entry->key.relFileNode.relNode,
                     entry->key.segmentFileNum,
                     entry->relationName,
                     (returned ? "true" : "false"),
                     resultSystemAppendOnlyCommitCount,
                     entry->key.nestLevel,
                     ItemPointerToString(&entry->persistentTid),
                     entry->persistentSerialNum,
                     (entry->mirrorCatchupRequired ? "true" : "false"),
                     entry->mirrorNewEof);
        }

        LWLockRelease(FileRepAppendOnlyCommitCountLock);

        END_CRIT_SECTION();

        MIRRORED_UNLOCK;

        entryIndex++;
    }

    return nentries;
}
/*
 * Hot Standby feedback
 */
static void
ProcessStandbyHSFeedbackMessage(void)
{
    StandbyHSFeedbackMessage reply;
    TransactionId newxmin = InvalidTransactionId;

    pq_copymsgbytes(&reply_message, (char *) &reply,
                    sizeof(StandbyHSFeedbackMessage));

    elog(DEBUG2, "hot standby feedback xmin %u epoch %u",
         reply.xmin,
         reply.epoch);

    /*
     * Update the WalSender's proc xmin to allow it to be visible to
     * snapshots.  This will hold back the removal of dead rows and thereby
     * prevent the generation of cleanup conflicts on the standby server.
     */
    if (TransactionIdIsValid(reply.xmin))
    {
        TransactionId nextXid;
        uint32      nextEpoch;
        bool        epochOK = false;

        GetNextXidAndEpoch(&nextXid, &nextEpoch);

        /*
         * The epoch of the standby's xmin should be the same as ours; or,
         * if the xid counter has since wrapped, one less than ours.
         */
        if (reply.xmin <= nextXid)
        {
            if (reply.epoch == nextEpoch)
                epochOK = true;
        }
        else
        {
            if (nextEpoch > 0 && reply.epoch == nextEpoch - 1)
                epochOK = true;
        }

        /*
         * Feedback from standby must not go backwards, nor should it go
         * forwards further than our most recent xid.
         */
        if (epochOK && TransactionIdPrecedesOrEquals(reply.xmin, nextXid))
        {
            if (!TransactionIdIsValid(MyProc->xmin))
            {
                TransactionId oldestXmin = GetOldestXmin(true, true);

                if (TransactionIdPrecedes(oldestXmin, reply.xmin))
                    newxmin = reply.xmin;
                else
                    newxmin = oldestXmin;
            }
            else
            {
                if (TransactionIdPrecedes(MyProc->xmin, reply.xmin))
                    newxmin = reply.xmin;
                else
                    newxmin = MyProc->xmin; /* stay the same */
            }
        }
    }

    /*
     * Grab the ProcArrayLock to set xmin, or invalidate for bad reply
     */
    if (MyProc->xmin != newxmin)
    {
        LWLockAcquire(ProcArrayLock, LW_SHARED);
        MyProc->xmin = newxmin;
        LWLockRelease(ProcArrayLock);
    }
}
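
/*
 * Worked example of the epoch check above (illustrative numbers only):
 * suppose GetNextXidAndEpoch() returns nextXid = 1000 and nextEpoch = 5.
 * A reply of (xmin = 900, epoch = 5) passes the first branch.  A reply of
 * (xmin = 4000000000, epoch = 4) passes the second branch, because an xmin
 * numerically above nextXid can only be current if it was taken just
 * before the 32-bit xid counter wrapped, i.e. in the previous epoch; the
 * subsequent TransactionIdPrecedesOrEquals test then accepts it under
 * modulo-2^32 xid comparison.  Any other combination is treated as stale
 * or garbled feedback and discarded.
 */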
/*
 * CheckPointTwoPhase -- handle 2PC component of checkpointing.
 *
 * We must fsync the state file of any GXACT that is valid and has a PREPARE
 * LSN <= the checkpoint's redo horizon.  (If the gxact isn't valid yet or
 * has a later LSN, this checkpoint is not responsible for fsyncing it.)
 *
 * This is deliberately run as late as possible in the checkpoint sequence,
 * because GXACTs ordinarily have short lifespans, and so it is quite
 * possible that GXACTs that were valid at checkpoint start will no longer
 * exist if we wait a little bit.
 *
 * If a GXACT remains valid across multiple checkpoints, it'll be fsynced
 * each time.  This is considered unusual enough that we don't bother to
 * expend any extra code to avoid the redundant fsyncs.  (They should be
 * reasonably cheap anyway, since they won't cause I/O.)
 */
void
CheckPointTwoPhase(XLogRecPtr redo_horizon)
{
    TransactionId *xids;
    int         nxids;
    char        path[MAXPGPATH];
    int         i;

    /*
     * We don't want to hold the TwoPhaseStateLock while doing I/O, so we
     * grab it just long enough to make a list of the XIDs that require
     * fsyncing, and then do the I/O afterwards.
     *
     * This approach creates a race condition: someone else could delete a
     * GXACT between the time we release TwoPhaseStateLock and the time we
     * try to open its state file.  We handle this by special-casing ENOENT
     * failures: if we see that, we verify that the GXACT is no longer
     * valid, and if so ignore the failure.
     */
    if (max_prepared_xacts <= 0)
        return;                 /* nothing to do */

    TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_START();

    xids = (TransactionId *) palloc(max_prepared_xacts * sizeof(TransactionId));
    nxids = 0;

    LWLockAcquire(TwoPhaseStateLock, LW_SHARED);

    for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
    {
        GlobalTransaction gxact = TwoPhaseState->prepXacts[i];

        if (gxact->valid &&
            XLByteLE(gxact->prepare_lsn, redo_horizon))
            xids[nxids++] = gxact->proc.xid;
    }

    LWLockRelease(TwoPhaseStateLock);

    for (i = 0; i < nxids; i++)
    {
        TransactionId xid = xids[i];
        int         fd;

        TwoPhaseFilePath(path, xid);

        fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);
        if (fd < 0)
        {
            if (errno == ENOENT)
            {
                /* OK if gxact is no longer valid */
                if (!TransactionIdIsPrepared(xid))
                    continue;
                /* Restore errno in case it was changed */
                errno = ENOENT;
            }
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not open two-phase state file \"%s\": %m",
                            path)));
        }

        if (pg_fsync(fd) != 0)
        {
            close(fd);
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not fsync two-phase state file \"%s\": %m",
                            path)));
        }

        if (close(fd) != 0)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not close two-phase state file \"%s\": %m",
                            path)));
    }

    pfree(xids);

    TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_DONE();
}
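
/*
 * For reference (hedged; reconstructed from the surrounding twophase.c
 * conventions): two-phase state files live under pg_twophase/, named by
 * the xid in zero-padded hex, so the TwoPhaseFilePath() call above amounts
 * to roughly:
 */
#define TWOPHASE_DIR "pg_twophase"
#define TwoPhaseFilePath(path, xid) \
    snprintf(path, MAXPGPATH, TWOPHASE_DIR "/%08X", xid)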
/*
 * Store some statistics for a statement.
 */
static void
pgss_store(const char *query, double total_time, uint64 rows,
           const BufferUsage *bufusage)
{
    pgssHashKey key;
    double      usage;
    pgssEntry  *entry;

    Assert(query != NULL);

    /* Safety check... */
    if (!pgss || !pgss_hash)
        return;

    /* Set up key for hashtable search */
    key.userid = GetUserId();
    key.dbid = MyDatabaseId;
    key.encoding = GetDatabaseEncoding();
    key.query_len = strlen(query);
    if (key.query_len >= pgss->query_size)
        key.query_len = pg_encoding_mbcliplen(key.encoding,
                                              query,
                                              key.query_len,
                                              pgss->query_size - 1);
    key.query_ptr = query;

    usage = USAGE_EXEC(duration);

    /* Lookup the hash table entry with shared lock. */
    LWLockAcquire(pgss->lock, LW_SHARED);

    entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_FIND, NULL);
    if (!entry)
    {
        /* Must acquire exclusive lock to add a new entry. */
        LWLockRelease(pgss->lock);
        LWLockAcquire(pgss->lock, LW_EXCLUSIVE);
        entry = entry_alloc(&key);
    }

    /* Grab the spinlock while updating the counters. */
    {
        volatile pgssEntry *e = (volatile pgssEntry *) entry;

        SpinLockAcquire(&e->mutex);
        e->counters.calls += 1;
        e->counters.total_time += total_time;
        e->counters.rows += rows;
        e->counters.shared_blks_hit += bufusage->shared_blks_hit;
        e->counters.shared_blks_read += bufusage->shared_blks_read;
        e->counters.shared_blks_written += bufusage->shared_blks_written;
        e->counters.local_blks_hit += bufusage->local_blks_hit;
        e->counters.local_blks_read += bufusage->local_blks_read;
        e->counters.local_blks_written += bufusage->local_blks_written;
        e->counters.temp_blks_read += bufusage->temp_blks_read;
        e->counters.temp_blks_written += bufusage->temp_blks_written;
        e->counters.usage += usage;
        SpinLockRelease(&e->mutex);
    }

    LWLockRelease(pgss->lock);
}
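
/*
 * Two hedged notes on the function above, based on the contemporaneous
 * pg_stat_statements source:
 *
 * 1. The shared-then-exclusive lock dance means another backend can insert
 *    the same key between LWLockRelease and LWLockAcquire; entry_alloc
 *    copes by doing hash_search(..., HASH_ENTER, &found) and returning the
 *    existing entry when found is true, so no duplicate is created.
 *
 * 2. USAGE_EXEC is a macro that ignores its argument (historically just
 *    1.0), which is why the otherwise-undeclared "duration" here still
 *    compiles.
 */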
/* * MarkAsPreparing * Reserve the GID for the given transaction. * * Internally, this creates a gxact struct and puts it into the active array. * NOTE: this is also used when reloading a gxact after a crash; so avoid * assuming that we can use very much backend context. */ GlobalTransaction MarkAsPreparing(TransactionId xid, const char *gid, TimestampTz prepared_at, Oid owner, Oid databaseid) { GlobalTransaction gxact; int i; if (strlen(gid) >= GIDSIZE) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("transaction identifier \"%s\" is too long", gid))); /* fail immediately if feature is disabled */ if (max_prepared_xacts == 0) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("prepared transactions are disabled"), errhint("Set max_prepared_transactions to a nonzero value."))); LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); /* * First, find and recycle any gxacts that failed during prepare. We do * this partly to ensure we don't mistakenly say their GIDs are still * reserved, and partly so we don't fail on out-of-slots unnecessarily. */ for (i = 0; i < TwoPhaseState->numPrepXacts; i++) { gxact = TwoPhaseState->prepXacts[i]; if (!gxact->valid && !TransactionIdIsActive(gxact->locking_xid)) { /* It's dead Jim ... remove from the active array */ TwoPhaseState->numPrepXacts--; TwoPhaseState->prepXacts[i] = TwoPhaseState->prepXacts[TwoPhaseState->numPrepXacts]; /* and put it back in the freelist */ gxact->proc.links.next = (SHM_QUEUE *) TwoPhaseState->freeGXacts; TwoPhaseState->freeGXacts = gxact; /* Back up index count too, so we don't miss scanning one */ i--; } } /* Check for conflicting GID */ for (i = 0; i < TwoPhaseState->numPrepXacts; i++) { gxact = TwoPhaseState->prepXacts[i]; if (strcmp(gxact->gid, gid) == 0) { ereport(ERROR, (errcode(ERRCODE_DUPLICATE_OBJECT), errmsg("transaction identifier \"%s\" is already in use", gid))); } } /* Get a free gxact from the freelist */ if (TwoPhaseState->freeGXacts == NULL) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("maximum number of prepared transactions reached"), errhint("Increase max_prepared_transactions (currently %d).", max_prepared_xacts))); gxact = TwoPhaseState->freeGXacts; TwoPhaseState->freeGXacts = (GlobalTransaction) gxact->proc.links.next; /* Initialize it */ MemSet(&gxact->proc, 0, sizeof(PGPROC)); SHMQueueElemInit(&(gxact->proc.links)); gxact->proc.waitStatus = STATUS_OK; /* We set up the gxact's VXID as InvalidBackendId/XID */ gxact->proc.lxid = (LocalTransactionId) xid; gxact->proc.xid = xid; gxact->proc.xmin = InvalidTransactionId; gxact->proc.pid = 0; gxact->proc.backendId = InvalidBackendId; gxact->proc.databaseId = databaseid; gxact->proc.roleId = owner; gxact->proc.inCommit = false; gxact->proc.vacuumFlags = 0; gxact->proc.lwWaiting = false; gxact->proc.lwExclusive = false; gxact->proc.lwWaitLink = NULL; gxact->proc.waitLock = NULL; gxact->proc.waitProcLock = NULL; for (i = 0; i < NUM_LOCK_PARTITIONS; i++) SHMQueueInit(&(gxact->proc.myProcLocks[i])); /* subxid data must be filled later by GXactLoadSubxactData */ gxact->proc.subxids.overflowed = false; gxact->proc.subxids.nxids = 0; gxact->prepared_at = prepared_at; /* initialize LSN to 0 (start of WAL) */ gxact->prepare_lsn.xlogid = 0; gxact->prepare_lsn.xrecoff = 0; gxact->owner = owner; gxact->locking_xid = xid; gxact->valid = false; strcpy(gxact->gid, gid); /* And insert it into the active array */ Assert(TwoPhaseState->numPrepXacts < max_prepared_xacts); TwoPhaseState->prepXacts[TwoPhaseState->numPrepXacts++] = gxact; 
LWLockRelease(TwoPhaseStateLock); return gxact; }
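/*
 * A standalone sketch of the recycling idiom used at the top of
 * MarkAsPreparing: removing element i from an unordered array by moving
 * the last element into its place, then stepping the index back so the
 * moved element is not skipped.  The "dead" flag stands in for
 * (!valid && locking xid no longer active).
 */
#include <stdbool.h>

#define MAXOBJS 32

typedef struct Obj
{
    bool        dead;
} Obj;

static Obj *active[MAXOBJS];
static int  nactive;

static void
reap_dead(void)
{
    int         i;

    for (i = 0; i < nactive; i++)
    {
        if (active[i]->dead)
        {
            /* order is irrelevant, so compact in O(1)... */
            active[i] = active[--nactive];
            /* ...and rescan slot i, which now holds an unvisited entry */
            i--;
        }
    }
}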
/* * Interrogate the commit timestamp of a transaction. * * The return value indicates whether a commit timestamp record was found for * the given xid. The timestamp value is returned in *ts (which may not be * null), and the origin node for the Xid is returned in *nodeid, if it's not * null. */ bool TransactionIdGetCommitTsData(TransactionId xid, TimestampTz *ts, RepOriginId *nodeid) { int pageno = TransactionIdToCTsPage(xid); int entryno = TransactionIdToCTsEntry(xid); int slotno; CommitTimestampEntry entry; TransactionId oldestCommitTs; TransactionId newestCommitTs; /* error if the given Xid doesn't normally commit */ if (!TransactionIdIsNormal(xid)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("cannot retrieve commit timestamp for transaction %u", xid))); LWLockAcquire(CommitTsLock, LW_SHARED); /* Error if module not enabled */ if (!commitTsShared->commitTsActive) error_commit_ts_disabled(); /* * If we're asked for the cached value, return that. Otherwise, fall * through to read from SLRU. */ if (commitTsShared->xidLastCommit == xid) { *ts = commitTsShared->dataLastCommit.time; if (nodeid) *nodeid = commitTsShared->dataLastCommit.nodeid; LWLockRelease(CommitTsLock); return *ts != 0; } oldestCommitTs = ShmemVariableCache->oldestCommitTs; newestCommitTs = ShmemVariableCache->newestCommitTs; /* neither is invalid, or both are */ Assert(TransactionIdIsValid(oldestCommitTs) == TransactionIdIsValid(newestCommitTs)); LWLockRelease(CommitTsLock); /* * Return empty if the requested value is outside our valid range. */ if (!TransactionIdIsValid(oldestCommitTs) || TransactionIdPrecedes(xid, oldestCommitTs) || TransactionIdPrecedes(newestCommitTs, xid)) { *ts = 0; if (nodeid) *nodeid = InvalidRepOriginId; return false; } /* lock is acquired by SimpleLruReadPage_ReadOnly */ slotno = SimpleLruReadPage_ReadOnly(CommitTsCtl, pageno, xid); memcpy(&entry, CommitTsCtl->shared->page_buffer[slotno] + SizeOfCommitTimestampEntry * entryno, SizeOfCommitTimestampEntry); *ts = entry.time; if (nodeid) *nodeid = entry.nodeid; LWLockRelease(CommitTsControlLock); return *ts != 0; }
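/*
 * An illustrative sketch of the xid-to-SLRU-coordinates mapping that
 * TransactionIdToCTsPage/TransactionIdToCTsEntry perform above: a plain
 * divide/modulo over fixed-size entries packed into fixed-size pages.
 * The page and entry sizes here are examples, not the module's real
 * constants.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 8192
#define ENTRY_SIZE 16           /* illustrative: timestamp plus origin id */
#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE)

static int
xid_to_page(uint32_t xid)
{
    return (int) (xid / ENTRIES_PER_PAGE);
}

static int
xid_to_entry(uint32_t xid)
{
    return (int) (xid % ENTRIES_PER_PAGE);
}

int
main(void)
{
    uint32_t    xid = 1000000;

    printf("xid %u lives at page %d, entry %d\n",
           xid, xid_to_page(xid), xid_to_entry(xid));
    return 0;
}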
/*
 * LockGXact
 *		Locate the prepared transaction and mark it busy for COMMIT or
 *		ROLLBACK.
 */
static GlobalTransaction
LockGXact(const char *gid, Oid user)
{
    int         i;

    LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);

    for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
    {
        GlobalTransaction gxact = TwoPhaseState->prepXacts[i];

        /* Ignore not-yet-valid GIDs */
        if (!gxact->valid)
            continue;
        if (strcmp(gxact->gid, gid) != 0)
            continue;

        /* Found it, but has someone else got it locked? */
        if (TransactionIdIsValid(gxact->locking_xid))
        {
            if (TransactionIdIsActive(gxact->locking_xid))
                ereport(ERROR,
                        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                         errmsg("prepared transaction with identifier \"%s\" is busy",
                                gid)));
            gxact->locking_xid = InvalidTransactionId;
        }

        if (user != gxact->owner && !superuser_arg(user))
            ereport(ERROR,
                    (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
                     errmsg("permission denied to finish prepared transaction"),
                     errhint("Must be superuser or the user that prepared the transaction.")));

        /*
         * Note: it probably would be possible to allow committing from
         * another database; but at the moment NOTIFY is known not to work
         * and there may be some other issues as well.  Hence disallow
         * until someone gets motivated to make it work.
         */
        if (MyDatabaseId != gxact->proc.databaseId)
            ereport(ERROR,
                    (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                     errmsg("prepared transaction belongs to another database"),
                     errhint("Connect to the database where the transaction was prepared to finish it.")));

        /* OK for me to lock it */
        gxact->locking_xid = GetTopTransactionId();

        LWLockRelease(TwoPhaseStateLock);

        return gxact;
    }

    LWLockRelease(TwoPhaseStateLock);

    ereport(ERROR,
            (errcode(ERRCODE_UNDEFINED_OBJECT),
             errmsg("prepared transaction with identifier \"%s\" does not exist",
                    gid)));

    /* NOTREACHED */
    return NULL;
}
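/*
 * A sketch of the busy-marking convention above, with process liveness
 * (kill(pid, 0)) standing in for TransactionIdIsActive(): a mark left by
 * a locker that died is treated as unlocked and silently reclaimed.  The
 * caller must serialize access to the entry, as LockGXact does via
 * TwoPhaseStateLock.  Names are illustrative.
 */
#include <errno.h>
#include <signal.h>
#include <stdbool.h>
#include <sys/types.h>
#include <unistd.h>

typedef struct PreparedXact
{
    pid_t       locking_pid;    /* 0 = not locked */
} PreparedXact;

static bool
pid_is_alive(pid_t pid)
{
    /* signal 0 probes existence; EPERM still means "exists" */
    return pid != 0 && (kill(pid, 0) == 0 || errno == EPERM);
}

/* Returns true if we now hold the busy-mark */
static bool
lock_pxact(PreparedXact *px)
{
    if (pid_is_alive(px->locking_pid))
        return false;           /* genuinely busy */
    px->locking_pid = getpid(); /* unlocked, or a stale mark: take it */
    return true;
}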
void
PersistentTablespace_AddMirrorAll(int16 pridbid, int16 mirdbid)
{
    TablespaceDirEntry tablespaceDirEntry;
    HASH_SEQ_STATUS hstat;

    WRITE_PERSISTENT_STATE_ORDERED_LOCK_DECLARE;

    hash_seq_init(&hstat, persistentTablespaceSharedHashTable);

    if (Persistent_BeforePersistenceWork())
        elog(ERROR, "persistent table changes forbidden");

    PersistentTablespace_VerifyInitScan();

    WRITE_PERSISTENT_STATE_ORDERED_LOCK;

    LWLockAcquire(TablespaceHashLock, LW_SHARED);

    while ((tablespaceDirEntry = hash_seq_search(&hstat)) != NULL)
    {
        PersistentFileSysObjName fsObjName;
        Oid         tblspc = tablespaceDirEntry->key.tablespaceOid;
        ItemPointerData persistentTid;
        uint64      persistentSerialNum;

        tablespaceDirEntry = PersistentTablespace_FindEntryUnderLock(tblspc);
        if (tablespaceDirEntry == NULL)
            elog(ERROR, "did not find persistent tablespace entry %u",
                 tblspc);

        persistentSerialNum = tablespaceDirEntry->persistentSerialNum;
        ItemPointerCopy(&tablespaceDirEntry->persistentTid, &persistentTid);

        /*
         * We release TablespaceHashLock in the middle of the loop and
         * re-acquire it after doing the persistent table change.  This is
         * needed to prevent holding the lock for any purpose other than to
         * protect the tablespace shared hash table.  Not releasing this
         * lock could result in file I/O and potential deadlock due to
         * other LW locks being acquired in the process.  Releasing the
         * lock this way is safe because we are still holding
         * PersistentObjLock in exclusive mode.  Any change to the
         * tablespace shared hash table is also protected by
         * PersistentObjLock.
         */
        LWLockRelease(TablespaceHashLock);

        PersistentFileSysObjName_SetTablespaceDir(&fsObjName, tblspc);
        PersistentFileSysObj_AddMirror(&fsObjName,
                                       &persistentTid,
                                       persistentSerialNum,
                                       pridbid,
                                       mirdbid,
                                       NULL,
                                       true,
                                       /* flushToXlog */ false);

        LWLockAcquire(TablespaceHashLock, LW_SHARED);
    }
    LWLockRelease(TablespaceHashLock);

    WRITE_PERSISTENT_STATE_ORDERED_UNLOCK;
}
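/*
 * A pthreads sketch of the loop shape above: copy out what you need while
 * the table is stable, release the hash lock, do the slow mirrored-object
 * work, then re-take the lock.  A plain array stands in for the shared
 * hash table, which also sidesteps the iterator-validity question the
 * original settles by holding PersistentObjLock throughout.  All names
 * are illustrative.
 */
#include <pthread.h>

#define NITEMS 4

typedef struct Item
{
    int         key;
    long        serial;
} Item;

static pthread_mutex_t table_mutex = PTHREAD_MUTEX_INITIALIZER;
static Item items[NITEMS];

/* stands in for PersistentFileSysObj_AddMirror: may do file I/O and take
 * other locks, so it must not run while table_mutex is held */
static void
add_mirror(int key, long serial)
{
}

static void
add_mirror_all(void)
{
    int         i;

    pthread_mutex_lock(&table_mutex);
    for (i = 0; i < NITEMS; i++)
    {
        int         key = items[i].key;
        long        serial = items[i].serial;

        pthread_mutex_unlock(&table_mutex);
        add_mirror(key, serial);
        pthread_mutex_lock(&table_mutex);
    }
    pthread_mutex_unlock(&table_mutex);
}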
/* * StrategyGetBuffer * * Called by the bufmgr to get the next candidate buffer to use in * BufferAlloc(). The only hard requirement BufferAlloc() has is that * the selected buffer must not currently be pinned by anyone. * * strategy is a BufferAccessStrategy object, or NULL for default strategy. * * To ensure that no one else can pin the buffer before we do, we must * return the buffer with the buffer header spinlock still held. If * *lock_held is set on exit, we have returned with the BufFreelistLock * still held, as well; the caller must release that lock once the spinlock * is dropped. We do it that way because releasing the BufFreelistLock * might awaken other processes, and it would be bad to do the associated * kernel calls while holding the buffer header spinlock. */ volatile BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, bool *lock_held) { volatile BufferDesc *buf; Latch *bgwriterLatch; int trycounter; /* * If given a strategy object, see whether it can select a buffer. We * assume strategy objects don't need the BufFreelistLock. */ if (strategy != NULL) { buf = GetBufferFromRing(strategy); if (buf != NULL) { *lock_held = false; return buf; } } /* Nope, so lock the freelist */ *lock_held = true; LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE); /* * We count buffer allocation requests so that the bgwriter can estimate * the rate of buffer consumption. Note that buffers recycled by a * strategy object are intentionally not counted here. */ StrategyControl->numBufferAllocs++; /* * If bgwriterLatch is set, we need to waken the bgwriter, but we should * not do so while holding BufFreelistLock; so release and re-grab. This * is annoyingly tedious, but it happens at most once per bgwriter cycle, * so the performance hit is minimal. */ bgwriterLatch = StrategyControl->bgwriterLatch; if (bgwriterLatch) { StrategyControl->bgwriterLatch = NULL; LWLockRelease(BufFreelistLock); SetLatch(bgwriterLatch); LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE); } /* * Try to get a buffer from the freelist. Note that the freeNext fields * are considered to be protected by the BufFreelistLock not the * individual buffer spinlocks, so it's OK to manipulate them without * holding the spinlock. */ while (StrategyControl->firstFreeBuffer >= 0) { buf = &BufferDescriptors[StrategyControl->firstFreeBuffer]; Assert(buf->freeNext != FREENEXT_NOT_IN_LIST); /* Unconditionally remove buffer from freelist */ StrategyControl->firstFreeBuffer = buf->freeNext; buf->freeNext = FREENEXT_NOT_IN_LIST; /* * If the buffer is pinned or has a nonzero usage_count, we cannot use * it; discard it and retry. (This can only happen if VACUUM put a * valid buffer in the freelist and then someone else used it before * we got to it. It's probably impossible altogether as of 8.3, but * we'd better check anyway.) 
         */
        LockBufHdr(buf);
        if (buf->refcount == 0 && buf->usage_count == 0)
        {
            if (strategy != NULL)
                AddBufferToRing(strategy, buf);
            return buf;
        }
        UnlockBufHdr(buf);
    }

    if (true)
    {
        /*
         * LRU buffer replacement policy: scan the entire pool and evict
         * the unpinned buffer with the oldest (smallest) access timestamp,
         * as recorded in each buffer's timer field.  The clock-sweep code
         * in the else branch below is retained but no longer reached.
         */
        for (;;)
        {
            int         currMinTimeStamp = INT_MAX;
            volatile BufferDesc *currTargetBuffer = NULL;

            for (StrategyControl->nextVictimBuffer = 0;
                 StrategyControl->nextVictimBuffer < NBuffers;
                 StrategyControl->nextVictimBuffer++)
            {
                buf = &BufferDescriptors[StrategyControl->nextVictimBuffer];

                LockBufHdr(buf);
                if (buf->refcount == 0 && buf->timer < currMinTimeStamp)
                {
                    currMinTimeStamp = buf->timer;
                    currTargetBuffer = buf;
                }
                UnlockBufHdr(buf);
            }

            /* each scan visits every buffer, so count a complete pass */
            StrategyControl->completePasses++;

            /* if no buffer was unpinned, fail rather than loop forever */
            if (currTargetBuffer == NULL)
                elog(ERROR, "no available buffer frame");

            /*
             * Re-take the header spinlock, since we must return with it
             * held.  The buffer may have been pinned since we examined it;
             * if so, rescan.
             */
            LockBufHdr(currTargetBuffer);
            if (currTargetBuffer->refcount == 0)
            {
                if (strategy != NULL)
                    AddBufferToRing(strategy, currTargetBuffer);
                return currTargetBuffer;
            }
            UnlockBufHdr(currTargetBuffer);
        }
    }
    else
    {
        /* Nothing on the freelist, so run the "clock sweep" algorithm */
        trycounter = NBuffers;
        for (;;)
        {
            buf = &BufferDescriptors[StrategyControl->nextVictimBuffer];

            if (++StrategyControl->nextVictimBuffer >= NBuffers)
            {
                StrategyControl->nextVictimBuffer = 0;
                StrategyControl->completePasses++;
            }

            /*
             * If the buffer is pinned or has a nonzero usage_count, we
             * cannot use it; decrement the usage_count (unless pinned) and
             * keep scanning.
             */
            LockBufHdr(buf);
            if (buf->refcount == 0)
            {
                if (buf->usage_count > 0)
                {
                    buf->usage_count--;
                    trycounter = NBuffers;
                }
                else
                {
                    /* Found a usable buffer */
                    if (strategy != NULL)
                        AddBufferToRing(strategy, buf);
                    return buf;
                }
            }
            else if (--trycounter == 0)
            {
                /*
                 * We've scanned all the buffers without making any state
                 * changes, so all the buffers are pinned (or were when we
                 * looked at them).  We could hope that someone will free
                 * one eventually, but it's probably better to fail than to
                 * risk getting stuck in an infinite loop.
                 */
                UnlockBufHdr(buf);
                elog(ERROR, "no unpinned buffers available");
            }
            UnlockBufHdr(buf);
        }

        /* not reached */
        return NULL;
    }
}
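/*
 * A standalone sketch of min-timestamp (LRU) victim selection over a
 * fixed-size pool, as used above; refcount and timer mirror the fields
 * in the buffer headers, minus the locking and the rescan-on-race logic.
 */
#include <limits.h>
#include <stdio.h>

#define NBUFFERS 8

typedef struct Buf
{
    int         refcount;       /* nonzero = pinned, ineligible */
    int         timer;          /* last-access timestamp */
} Buf;

/* Returns the slot with the smallest timer among unpinned buffers,
 * or -1 if every buffer is pinned. */
static int
pick_lru_victim(const Buf bufs[], int n)
{
    int         best = -1;
    int         best_ts = INT_MAX;
    int         i;

    for (i = 0; i < n; i++)
        if (bufs[i].refcount == 0 && bufs[i].timer < best_ts)
        {
            best_ts = bufs[i].timer;
            best = i;
        }
    return best;
}

int
main(void)
{
    Buf         pool[NBUFFERS] = {{1, 5}, {0, 9}, {0, 3}, {1, 1},
                                  {0, 7}, {1, 2}, {0, 4}, {1, 8}};

    /* slot 2 wins: timer 3 is the oldest among the unpinned buffers */
    printf("victim slot: %d\n", pick_lru_victim(pool, NBUFFERS));
    return 0;
}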
/* * SharedInvalBackendInit * Initialize a new backend to operate on the sinval buffer */ void SharedInvalBackendInit(bool sendOnly) { int index; ProcState *stateP = NULL; SISeg *segP = shmInvalBuffer; /* * This can run in parallel with read operations, and for that matter with * write operations; but not in parallel with additions and removals of * backends, nor in parallel with SICleanupQueue. It doesn't seem worth * having a third lock, so we choose to use SInvalWriteLock to serialize * additions/removals. */ LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE); /* Look for a free entry in the procState array */ for (index = 0; index < segP->lastBackend; index++) { if (segP->procState[index].procPid == 0) /* inactive slot? */ { stateP = &segP->procState[index]; break; } } if (stateP == NULL) { if (segP->lastBackend < segP->maxBackends) { stateP = &segP->procState[segP->lastBackend]; Assert(stateP->procPid == 0); segP->lastBackend++; } else { /* * out of procState slots: MaxBackends exceeded -- report normally */ MyBackendId = InvalidBackendId; LWLockRelease(SInvalWriteLock); ereport(FATAL, (errcode(ERRCODE_TOO_MANY_CONNECTIONS), errmsg("sorry, too many clients already"))); } } MyBackendId = (stateP - &segP->procState[0]) + 1; /* Advertise assigned backend ID in MyProc */ MyProc->backendId = MyBackendId; /* Fetch next local transaction ID into local memory */ nextLocalTransactionId = stateP->nextLXID; /* mark myself active, with all extant messages already read */ stateP->procPid = MyProcPid; stateP->nextMsgNum = segP->maxMsgNum; stateP->resetState = false; stateP->signaled = false; stateP->sendOnly = sendOnly; LWLockRelease(SInvalWriteLock); /* register exit routine to mark my entry inactive at exit */ on_shmem_exit(CleanupInvalidationState, PointerGetDatum(segP)); elog(DEBUG4, "my backend id is %d", MyBackendId); }
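/*
 * A sketch of the slot-claiming logic above: reuse a free slot inside the
 * current high-water mark if possible, else extend the mark, else fail;
 * the resulting id is the 1-based slot index, as with MyBackendId.
 * Serialization (SInvalWriteLock in the original) is assumed, and the
 * names here are illustrative.
 */
#define MAX_BACKENDS 8

typedef struct Slot
{
    int         pid;            /* 0 means the slot is free */
} Slot;

static Slot slots[MAX_BACKENDS];
static int  last_backend;       /* high-water mark of used slots */

/* Returns the claimed backend id, or -1 if every slot is taken. */
static int
claim_slot(int pid)
{
    Slot       *s = NULL;
    int         i;

    for (i = 0; i < last_backend; i++)
        if (slots[i].pid == 0)
        {
            s = &slots[i];
            break;
        }

    if (s == NULL)
    {
        if (last_backend >= MAX_BACKENDS)
            return -1;          /* "sorry, too many clients already" */
        s = &slots[last_backend++];
    }

    s->pid = pid;
    return (int) (s - slots) + 1;   /* ids are 1-based */
}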
/* * CheckDeadLock * * We only get to this routine, if DEADLOCK_TIMEOUT fired while waiting for a * lock to be released by some other process. Check if there's a deadlock; if * not, just return. (But signal ProcSleep to log a message, if * log_lock_waits is true.) If we have a real deadlock, remove ourselves from * the lock's wait queue and signal an error to ProcSleep. */ static void CheckDeadLock(void) { int i; /* * Acquire exclusive lock on the entire shared lock data structures. Must * grab LWLocks in partition-number order to avoid LWLock deadlock. * * Note that the deadlock check interrupt had better not be enabled * anywhere that this process itself holds lock partition locks, else this * will wait forever. Also note that LWLockAcquire creates a critical * section, so that this routine cannot be interrupted by cancel/die * interrupts. */ for (i = 0; i < NUM_LOCK_PARTITIONS; i++) LWLockAcquire(LockHashPartitionLockByIndex(i), LW_EXCLUSIVE); /* * Check to see if we've been awoken by anyone in the interim. * * If we have, we can return and resume our transaction -- happy day. * Before we are awoken the process releasing the lock grants it to us so * we know that we don't have to wait anymore. * * We check by looking to see if we've been unlinked from the wait queue. * This is quicker than checking our semaphore's state, since no kernel * call is needed, and it is safe because we hold the lock partition lock. */ if (MyProc->links.prev == NULL || MyProc->links.next == NULL) goto check_done; #ifdef LOCK_DEBUG if (Debug_deadlocks) DumpAllLocks(); #endif /* Run the deadlock check, and set deadlock_state for use by ProcSleep */ deadlock_state = DeadLockCheck(MyProc); if (deadlock_state == DS_HARD_DEADLOCK) { /* * Oops. We have a deadlock. * * Get this process out of wait state. (Note: we could do this more * efficiently by relying on lockAwaited, but use this coding to * preserve the flexibility to kill some other transaction than the * one detecting the deadlock.) * * RemoveFromWaitQueue sets MyProc->waitStatus to STATUS_ERROR, so * ProcSleep will report an error after we return from the signal * handler. */ Assert(MyProc->waitLock != NULL); RemoveFromWaitQueue(MyProc, LockTagHashCode(&(MyProc->waitLock->tag))); /* * We're done here. Transaction abort caused by the error that * ProcSleep will raise will cause any other locks we hold to be * released, thus allowing other processes to wake up; we don't need * to do that here. NOTE: an exception is that releasing locks we * hold doesn't consider the possibility of waiters that were blocked * behind us on the lock we just failed to get, and might now be * wakable because we're not in front of them anymore. However, * RemoveFromWaitQueue took care of waking up any such processes. */ } /* * And release locks. We do this in reverse order for two reasons: (1) * Anyone else who needs more than one of the locks will be trying to lock * them in increasing order; we don't want to release the other process * until it can get all the locks it needs. (2) This avoids O(N^2) * behavior inside LWLockRelease. */ check_done: for (i = NUM_LOCK_PARTITIONS; --i >= 0;) LWLockRelease(LockHashPartitionLockByIndex(i)); }
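/*
 * A pthreads sketch of the ordering discipline above: acquire all
 * partition locks in ascending index order, release in descending order.
 * Two threads running lock_all_partitions() concurrently cannot deadlock
 * against each other, because both request the mutexes in the same global
 * order.  Names are illustrative.
 */
#include <pthread.h>

#define NUM_PARTITIONS 16

static pthread_mutex_t partition_locks[NUM_PARTITIONS];

static void
init_partition_locks(void)
{
    int         i;

    for (i = 0; i < NUM_PARTITIONS; i++)
        pthread_mutex_init(&partition_locks[i], NULL);
}

static void
lock_all_partitions(void)
{
    int         i;

    for (i = 0; i < NUM_PARTITIONS; i++)
        pthread_mutex_lock(&partition_locks[i]);
}

/* Release in reverse, for the reasons given in CheckDeadLock's comment */
static void
unlock_all_partitions(void)
{
    int         i;

    for (i = NUM_PARTITIONS; --i >= 0;)
        pthread_mutex_unlock(&partition_locks[i]);
}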
/* * SIGetDataEntries * get next SI message(s) for current backend, if there are any * * Possible return values: * 0: no SI message available * n>0: next n SI messages have been extracted into data[] * -1: SI reset message extracted * * If the return value is less than the array size "datasize", the caller * can assume that there are no more SI messages after the one(s) returned. * Otherwise, another call is needed to collect more messages. * * NB: this can run in parallel with other instances of SIGetDataEntries * executing on behalf of other backends, since each instance will modify only * fields of its own backend's ProcState, and no instance will look at fields * of other backends' ProcStates. We express this by grabbing SInvalReadLock * in shared mode. Note that this is not exactly the normal (read-only) * interpretation of a shared lock! Look closely at the interactions before * allowing SInvalReadLock to be grabbed in shared mode for any other reason! * * NB: this can also run in parallel with SIInsertDataEntries. It is not * guaranteed that we will return any messages added after the routine is * entered. * * Note: we assume that "datasize" is not so large that it might be important * to break our hold on SInvalReadLock into segments. */ int SIGetDataEntries(SharedInvalidationMessage *data, int datasize) { SISeg *segP; ProcState *stateP; int max; int n; LWLockAcquire(SInvalReadLock, LW_SHARED); segP = shmInvalBuffer; stateP = &segP->procState[MyBackendId - 1]; /* Fetch current value of maxMsgNum using spinlock */ { /* use volatile pointer to prevent code rearrangement */ volatile SISeg *vsegP = segP; SpinLockAcquire(&vsegP->msgnumLock); max = vsegP->maxMsgNum; SpinLockRelease(&vsegP->msgnumLock); } if (stateP->resetState) { /* * Force reset. We can say we have dealt with any messages added * since the reset, as well; and that means we should clear the * signaled flag, too. */ stateP->nextMsgNum = max; stateP->resetState = false; stateP->signaled = false; LWLockRelease(SInvalReadLock); return -1; } /* * Retrieve messages and advance backend's counter, until data array is * full or there are no more messages. * * There may be other backends that haven't read the message(s), so we * cannot delete them here. SICleanupQueue() will eventually remove them * from the queue. */ n = 0; while (n < datasize && stateP->nextMsgNum < max) { data[n++] = segP->buffer[stateP->nextMsgNum % MAXNUMMESSAGES]; stateP->nextMsgNum++; } /* * Reset our "signaled" flag whenever we have caught up completely. */ if (stateP->nextMsgNum >= max) stateP->signaled = false; LWLockRelease(SInvalReadLock); return n; }
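/*
 * A standalone sketch of the per-reader cursor scheme above: one shared
 * circular buffer, a writer-advanced max counter, and each reader
 * draining from its own cursor.  The locking, spinlocked counter read,
 * and reset/cleanup protocol are omitted; names are illustrative.
 */
#include <stdio.h>

#define RINGSIZE 8

typedef struct Msg
{
    int         payload;
} Msg;

static Msg  ring[RINGSIZE];
static int  max_msg_num;        /* advanced by the writer */

/* Copy up to datasize pending messages; returns how many were copied. */
static int
get_entries(int *next_msg_num, Msg *data, int datasize)
{
    int         n = 0;

    while (n < datasize && *next_msg_num < max_msg_num)
        data[n++] = ring[(*next_msg_num)++ % RINGSIZE];
    return n;
}

int
main(void)
{
    Msg         out[4];
    int         cursor = 0;     /* this reader's private position */
    int         i;

    for (i = 0; i < 6; i++)     /* writer appends six messages */
        ring[max_msg_num++ % RINGSIZE].payload = i;

    printf("read %d messages\n", get_entries(&cursor, out, 4)); /* 4 */
    printf("read %d messages\n", get_entries(&cursor, out, 4)); /* 2 */
    return 0;
}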
/* * ProcSleep -- put a process to sleep on the specified lock * * Caller must have set MyProc->heldLocks to reflect locks already held * on the lockable object by this process (under all XIDs). * * The lock table's partition lock must be held at entry, and will be held * at exit. * * Result: STATUS_OK if we acquired the lock, STATUS_ERROR if not (deadlock). * * ASSUME: that no one will fiddle with the queue until after * we release the partition lock. * * NOTES: The process queue is now a priority queue for locking. */ int ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable) { LOCKMODE lockmode = locallock->tag.mode; LOCK *lock = locallock->lock; PROCLOCK *proclock = locallock->proclock; uint32 hashcode = locallock->hashcode; LWLock *partitionLock = LockHashPartitionLock(hashcode); PROC_QUEUE *waitQueue = &(lock->waitProcs); LOCKMASK myHeldLocks = MyProc->heldLocks; bool early_deadlock = false; bool allow_autovacuum_cancel = true; int myWaitStatus; PGPROC *proc; int i; /* * Determine where to add myself in the wait queue. * * Normally I should go at the end of the queue. However, if I already * hold locks that conflict with the request of any previous waiter, put * myself in the queue just in front of the first such waiter. This is not * a necessary step, since deadlock detection would move me to before that * waiter anyway; but it's relatively cheap to detect such a conflict * immediately, and avoid delaying till deadlock timeout. * * Special case: if I find I should go in front of some waiter, check to * see if I conflict with already-held locks or the requests before that * waiter. If not, then just grant myself the requested lock immediately. * This is the same as the test for immediate grant in LockAcquire, except * we are only considering the part of the wait queue before my insertion * point. */ if (myHeldLocks != 0) { LOCKMASK aheadRequests = 0; proc = (PGPROC *) waitQueue->links.next; for (i = 0; i < waitQueue->size; i++) { /* Must he wait for me? */ if (lockMethodTable->conflictTab[proc->waitLockMode] & myHeldLocks) { /* Must I wait for him ? */ if (lockMethodTable->conflictTab[lockmode] & proc->heldLocks) { /* * Yes, so we have a deadlock. Easiest way to clean up * correctly is to call RemoveFromWaitQueue(), but we * can't do that until we are *on* the wait queue. So, set * a flag to check below, and break out of loop. Also, * record deadlock info for later message. */ RememberSimpleDeadLock(MyProc, lockmode, lock, proc); early_deadlock = true; break; } /* I must go before this waiter. Check special case. */ if ((lockMethodTable->conflictTab[lockmode] & aheadRequests) == 0 && LockCheckConflicts(lockMethodTable, lockmode, lock, proclock) == STATUS_OK) { /* Skip the wait and just grant myself the lock. */ GrantLock(lock, proclock, lockmode); GrantAwaitedLock(); return STATUS_OK; } /* Break out of loop to put myself before him */ break; } /* Nope, so advance to next waiter */ aheadRequests |= LOCKBIT_ON(proc->waitLockMode); proc = (PGPROC *) proc->links.next; } /* * If we fall out of loop normally, proc points to waitQueue head, so * we will insert at tail of queue as desired. */ } else { /* I hold no locks, so I can't push in front of anyone. */ proc = (PGPROC *) &(waitQueue->links); } /* * Insert self into queue, ahead of the given proc (or at tail of queue). 
*/ SHMQueueInsertBefore(&(proc->links), &(MyProc->links)); waitQueue->size++; lock->waitMask |= LOCKBIT_ON(lockmode); /* Set up wait information in PGPROC object, too */ MyProc->waitLock = lock; MyProc->waitProcLock = proclock; MyProc->waitLockMode = lockmode; MyProc->waitStatus = STATUS_WAITING; /* * If we detected deadlock, give up without waiting. This must agree with * CheckDeadLock's recovery code, except that we shouldn't release the * semaphore since we haven't tried to lock it yet. */ if (early_deadlock) { RemoveFromWaitQueue(MyProc, hashcode); return STATUS_ERROR; } /* mark that we are waiting for a lock */ lockAwaited = locallock; /* * Release the lock table's partition lock. * * NOTE: this may also cause us to exit critical-section state, possibly * allowing a cancel/die interrupt to be accepted. This is OK because we * have recorded the fact that we are waiting for a lock, and so * LockErrorCleanup will clean up if cancel/die happens. */ LWLockRelease(partitionLock); /* * Also, now that we will successfully clean up after an ereport, it's * safe to check to see if there's a buffer pin deadlock against the * Startup process. Of course, that's only necessary if we're doing Hot * Standby and are not the Startup process ourselves. */ if (RecoveryInProgress() && !InRecovery) CheckRecoveryConflictDeadlock(); /* Reset deadlock_state before enabling the timeout handler */ deadlock_state = DS_NOT_YET_CHECKED; got_deadlock_timeout = false; /* * Set timer so we can wake up after awhile and check for a deadlock. If a * deadlock is detected, the handler releases the process's semaphore and * sets MyProc->waitStatus = STATUS_ERROR, allowing us to know that we * must report failure rather than success. * * By delaying the check until we've waited for a bit, we can avoid * running the rather expensive deadlock-check code in most cases. * * If LockTimeout is set, also enable the timeout for that. We can save a * few cycles by enabling both timeout sources in one call. */ if (LockTimeout > 0) { EnableTimeoutParams timeouts[2]; timeouts[0].id = DEADLOCK_TIMEOUT; timeouts[0].type = TMPARAM_AFTER; timeouts[0].delay_ms = DeadlockTimeout; timeouts[1].id = LOCK_TIMEOUT; timeouts[1].type = TMPARAM_AFTER; timeouts[1].delay_ms = LockTimeout; enable_timeouts(timeouts, 2); } else enable_timeout_after(DEADLOCK_TIMEOUT, DeadlockTimeout); /* * If somebody wakes us between LWLockRelease and WaitLatch, the latch * will not wait. But a set latch does not necessarily mean that the lock * is free now, as there are many other sources for latch sets than * somebody releasing the lock. * * We process interrupts whenever the latch has been set, so cancel/die * interrupts are processed quickly. This means we must not mind losing * control to a cancel/die interrupt here. We don't, because we have no * shared-state-change work to do after being granted the lock (the * grantor did it all). We do have to worry about canceling the deadlock * timeout and updating the locallock table, but if we lose control to an * error, LockErrorCleanup will fix that up. */ do { WaitLatch(MyLatch, WL_LATCH_SET, 0); ResetLatch(MyLatch); /* check for deadlocks first, as that's probably log-worthy */ if (got_deadlock_timeout) { CheckDeadLock(); got_deadlock_timeout = false; } CHECK_FOR_INTERRUPTS(); /* * waitStatus could change from STATUS_WAITING to something else * asynchronously. Read it just once per loop to prevent surprising * behavior (such as missing log messages). 
*/ myWaitStatus = *((volatile int *) &MyProc->waitStatus); /* * If we are not deadlocked, but are waiting on an autovacuum-induced * task, send a signal to interrupt it. */ if (deadlock_state == DS_BLOCKED_BY_AUTOVACUUM && allow_autovacuum_cancel) { PGPROC *autovac = GetBlockingAutoVacuumPgproc(); PGXACT *autovac_pgxact = &ProcGlobal->allPgXact[autovac->pgprocno]; LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); /* * Only do it if the worker is not working to protect against Xid * wraparound. */ if ((autovac_pgxact->vacuumFlags & PROC_IS_AUTOVACUUM) && !(autovac_pgxact->vacuumFlags & PROC_VACUUM_FOR_WRAPAROUND)) { int pid = autovac->pid; StringInfoData locktagbuf; StringInfoData logbuf; /* errdetail for server log */ initStringInfo(&locktagbuf); initStringInfo(&logbuf); DescribeLockTag(&locktagbuf, &lock->tag); appendStringInfo(&logbuf, _("Process %d waits for %s on %s."), MyProcPid, GetLockmodeName(lock->tag.locktag_lockmethodid, lockmode), locktagbuf.data); /* release lock as quickly as possible */ LWLockRelease(ProcArrayLock); ereport(LOG, (errmsg("sending cancel to blocking autovacuum PID %d", pid), errdetail_log("%s", logbuf.data))); pfree(logbuf.data); pfree(locktagbuf.data); /* send the autovacuum worker Back to Old Kent Road */ if (kill(pid, SIGINT) < 0) { /* Just a warning to allow multiple callers */ ereport(WARNING, (errmsg("could not send signal to process %d: %m", pid))); } } else LWLockRelease(ProcArrayLock); /* prevent signal from being resent more than once */ allow_autovacuum_cancel = false; } /* * If awoken after the deadlock check interrupt has run, and * log_lock_waits is on, then report about the wait. */ if (log_lock_waits && deadlock_state != DS_NOT_YET_CHECKED) { StringInfoData buf, lock_waiters_sbuf, lock_holders_sbuf; const char *modename; long secs; int usecs; long msecs; SHM_QUEUE *procLocks; PROCLOCK *proclock; bool first_holder = true, first_waiter = true; int lockHoldersNum = 0; initStringInfo(&buf); initStringInfo(&lock_waiters_sbuf); initStringInfo(&lock_holders_sbuf); DescribeLockTag(&buf, &locallock->tag.lock); modename = GetLockmodeName(locallock->tag.lock.locktag_lockmethodid, lockmode); TimestampDifference(get_timeout_start_time(DEADLOCK_TIMEOUT), GetCurrentTimestamp(), &secs, &usecs); msecs = secs * 1000 + usecs / 1000; usecs = usecs % 1000; /* * we loop over the lock's procLocks to gather a list of all * holders and waiters. Thus we will be able to provide more * detailed information for lock debugging purposes. * * lock->procLocks contains all processes which hold or wait for * this lock. 
*/ LWLockAcquire(partitionLock, LW_SHARED); procLocks = &(lock->procLocks); proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks, offsetof(PROCLOCK, lockLink)); while (proclock) { /* * we are a waiter if myProc->waitProcLock == proclock; we are * a holder if it is NULL or something different */ if (proclock->tag.myProc->waitProcLock == proclock) { if (first_waiter) { appendStringInfo(&lock_waiters_sbuf, "%d", proclock->tag.myProc->pid); first_waiter = false; } else appendStringInfo(&lock_waiters_sbuf, ", %d", proclock->tag.myProc->pid); } else { if (first_holder) { appendStringInfo(&lock_holders_sbuf, "%d", proclock->tag.myProc->pid); first_holder = false; } else appendStringInfo(&lock_holders_sbuf, ", %d", proclock->tag.myProc->pid); lockHoldersNum++; } proclock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->lockLink, offsetof(PROCLOCK, lockLink)); } LWLockRelease(partitionLock); if (deadlock_state == DS_SOFT_DEADLOCK) ereport(LOG, (errmsg("process %d avoided deadlock for %s on %s by rearranging queue order after %ld.%03d ms", MyProcPid, modename, buf.data, msecs, usecs), (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.", "Processes holding the lock: %s. Wait queue: %s.", lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data)))); else if (deadlock_state == DS_HARD_DEADLOCK) { /* * This message is a bit redundant with the error that will be * reported subsequently, but in some cases the error report * might not make it to the log (eg, if it's caught by an * exception handler), and we want to ensure all long-wait * events get logged. */ ereport(LOG, (errmsg("process %d detected deadlock while waiting for %s on %s after %ld.%03d ms", MyProcPid, modename, buf.data, msecs, usecs), (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.", "Processes holding the lock: %s. Wait queue: %s.", lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data)))); } if (myWaitStatus == STATUS_WAITING) ereport(LOG, (errmsg("process %d still waiting for %s on %s after %ld.%03d ms", MyProcPid, modename, buf.data, msecs, usecs), (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.", "Processes holding the lock: %s. Wait queue: %s.", lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data)))); else if (myWaitStatus == STATUS_OK) ereport(LOG, (errmsg("process %d acquired %s on %s after %ld.%03d ms", MyProcPid, modename, buf.data, msecs, usecs))); else { Assert(myWaitStatus == STATUS_ERROR); /* * Currently, the deadlock checker always kicks its own * process, which means that we'll only see STATUS_ERROR when * deadlock_state == DS_HARD_DEADLOCK, and there's no need to * print redundant messages. But for completeness and * future-proofing, print a message if it looks like someone * else kicked us off the lock. */ if (deadlock_state != DS_HARD_DEADLOCK) ereport(LOG, (errmsg("process %d failed to acquire %s on %s after %ld.%03d ms", MyProcPid, modename, buf.data, msecs, usecs), (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.", "Processes holding the lock: %s. Wait queue: %s.", lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data)))); } /* * At this point we might still need to wait for the lock. Reset * state so we don't print the above messages again. */ deadlock_state = DS_NO_DEADLOCK; pfree(buf.data); pfree(lock_holders_sbuf.data); pfree(lock_waiters_sbuf.data); } } while (myWaitStatus == STATUS_WAITING); /* * Disable the timers, if they are still running. 
As in LockErrorCleanup, * we must preserve the LOCK_TIMEOUT indicator flag: if a lock timeout has * already caused QueryCancelPending to become set, we want the cancel to * be reported as a lock timeout, not a user cancel. */ if (LockTimeout > 0) { DisableTimeoutParams timeouts[2]; timeouts[0].id = DEADLOCK_TIMEOUT; timeouts[0].keep_indicator = false; timeouts[1].id = LOCK_TIMEOUT; timeouts[1].keep_indicator = true; disable_timeouts(timeouts, 2); } else disable_timeout(DEADLOCK_TIMEOUT, false); /* * Re-acquire the lock table's partition lock. We have to do this to hold * off cancel/die interrupts before we can mess with lockAwaited (else we * might have a missed or duplicated locallock update). */ LWLockAcquire(partitionLock, LW_EXCLUSIVE); /* * We no longer want LockErrorCleanup to do anything. */ lockAwaited = NULL; /* * If we got the lock, be sure to remember it in the locallock table. */ if (MyProc->waitStatus == STATUS_OK) GrantAwaitedLock(); /* * We don't have to do anything else, because the awaker did all the * necessary update of the lock table and MyProc. */ return MyProc->waitStatus; }
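/*
 * A standalone sketch of ProcSleep's queue-jump rule: a new waiter goes
 * to the tail unless some earlier waiter would have to wait for a lock
 * the newcomer already holds, in which case the newcomer slots in front
 * of that waiter.  The early-deadlock and immediate-grant special cases
 * are omitted, and lock modes are reduced to indexes plus bitmasks.
 */
#include <stdint.h>

#define NMODES 4

typedef struct Waiter
{
    int         wait_mode;      /* index into conflicts[] */
} Waiter;

/* conflicts[m] is the bitmask of modes that conflict with mode m.
 * Returns the queue index to insert before (nwaiters = append at tail). */
static int
choose_insert_pos(const uint32_t conflicts[NMODES], uint32_t my_held_mask,
                  const Waiter *queue, int nwaiters)
{
    int         i;

    if (my_held_mask == 0)
        return nwaiters;        /* hold nothing: cannot jump, go to tail */

    for (i = 0; i < nwaiters; i++)
    {
        /* must this waiter wait for me? then I go in front of him */
        if (conflicts[queue[i].wait_mode] & my_held_mask)
            return i;
    }
    return nwaiters;
}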
/*
 * TransactionTreeSetCommitTsData
 *
 * Record the final commit timestamp of transaction entries in the commit log
 * for a transaction and its subtransaction tree, as efficiently as possible.
 *
 * xid is the top level transaction id.
 *
 * subxids is an array of xids of length nsubxids, representing subtransactions
 * in the tree of xid.  In various cases nsubxids may be zero.
 *
 * The reason why tracking just the parent xid commit timestamp is not enough
 * is that the subtrans SLRU does not stay valid across crashes (it's not
 * permanent) so we need to keep the information about them here.  If the
 * subtrans implementation changes in the future, we might want to revisit the
 * decision of storing timestamp info for each subxid.
 *
 * The write_xlog parameter tells us whether to include an XLog record of this
 * or not.  Normally, this is called from transaction commit routines (both
 * normal and prepared) and the information will be stored in the transaction
 * commit XLog record, and so they should pass "false" for this.  The XLog
 * redo code should use "false" here as well.  Other callers probably want to
 * pass true, so that the given values persist in case of crashes.
 */
void
TransactionTreeSetCommitTsData(TransactionId xid, int nsubxids,
							   TransactionId *subxids, TimestampTz timestamp,
							   RepOriginId nodeid, bool write_xlog)
{
    int         i;
    TransactionId headxid;
    TransactionId newestXact;

    /*
     * No-op if the module is not active.
     *
     * An unlocked read here is fine, because in a standby (the only place
     * where the flag can change in flight) this routine is only called by
     * the recovery process, which is also the only process which can change
     * the flag.
     */
    if (!commitTsShared->commitTsActive)
        return;

    /*
     * Comply with the WAL-before-data rule: if caller specified it wants
     * this value to be recorded in WAL, do so before touching the data.
     */
    if (write_xlog)
        WriteSetTimestampXlogRec(xid, nsubxids, subxids, timestamp, nodeid);

    /*
     * Figure out the latest Xid in this batch: either the last subxid if
     * there's any, otherwise the parent xid.
     */
    if (nsubxids > 0)
        newestXact = subxids[nsubxids - 1];
    else
        newestXact = xid;

    /*
     * We split the xids to set the timestamp to in groups belonging to the
     * same SLRU page; the first element in each such set is its head.  The
     * first group has the main XID as the head; subsequent sets use the
     * first subxid not on the previous page as head.  This way, we only
     * have to lock/modify each SLRU page once.
     */
    for (i = 0, headxid = xid;;)
    {
        int         pageno = TransactionIdToCTsPage(headxid);
        int         j;

        for (j = i; j < nsubxids; j++)
        {
            if (TransactionIdToCTsPage(subxids[j]) != pageno)
                break;
        }

        /* subxids[i..j) are on the same page as the head */
        SetXidCommitTsInPage(headxid, j - i, subxids + i, timestamp, nodeid,
                             pageno);

        /*
         * If we wrote out all subxids, we're done.  (A subxid that starts
         * a new page must become the head of another group, so the test is
         * j >= nsubxids rather than j + 1 >= nsubxids; the latter would
         * skip a last subxid that falls on a fresh page.)
         */
        if (j >= nsubxids)
            break;

        /*
         * Set the new head and skip over it, as well as over the subxids
         * we just wrote.
         */
        headxid = subxids[j];
        i = j + 1;
    }

    /* update the cached value in shared memory */
    LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
    commitTsShared->xidLastCommit = xid;
    commitTsShared->dataLastCommit.time = timestamp;
    commitTsShared->dataLastCommit.nodeid = nodeid;

    /* and move forwards our endpoint, if needed */
    if (TransactionIdPrecedes(ShmemVariableCache->newestCommitTsXid,
                              newestXact))
        ShmemVariableCache->newestCommitTsXid = newestXact;

    LWLockRelease(CommitTsLock);
}
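/*
 * A standalone sketch of the page-grouping loop above: given an ascending
 * xid array, emit one "write" per SLRU page.  It also exercises the
 * corner case the loop condition must handle, a trailing subxid that is
 * alone on a fresh page.  ENTRIES_PER_PAGE is deliberately tiny so the
 * groups are visible; it is not the module's real constant.
 */
#include <stdint.h>
#include <stdio.h>

#define ENTRIES_PER_PAGE 4

static int
xid_to_page(uint32_t xid)
{
    return (int) (xid / ENTRIES_PER_PAGE);
}

static void
set_tree_data(uint32_t xid, int nsub, const uint32_t *sub)
{
    uint32_t    head = xid;
    int         i = 0;

    for (;;)
    {
        int         page = xid_to_page(head);
        int         j;

        for (j = i; j < nsub; j++)
            if (xid_to_page(sub[j]) != page)
                break;

        /* one locked page visit would cover head plus sub[i..j) */
        printf("page %d: head %u plus sub[%d..%d)\n", page, head, i, j);

        if (j >= nsub)          /* everything written */
            break;
        head = sub[j];          /* first xid on the next page becomes head */
        i = j + 1;
    }
}

int
main(void)
{
    /* 21 lands alone on page 5 and still gets its own group */
    uint32_t    subs[] = {10, 11, 14, 15, 21};

    set_tree_data(9, 5, subs);
    return 0;
}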
/* * Remove all segments before the one holding the passed page number */ void SimpleLruTruncate(SlruCtl ctl, int cutoffPage) { SlruShared shared = ctl->shared; int slotno; /* * The cutoff point is the start of the segment containing cutoffPage. */ cutoffPage -= cutoffPage % SLRU_PAGES_PER_SEGMENT; /* * Scan shared memory and remove any pages preceding the cutoff page, to * ensure we won't rewrite them later. (Since this is normally called in * or just after a checkpoint, any dirty pages should have been flushed * already ... we're just being extra careful here.) */ LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); restart:; /* * While we are holding the lock, make an important safety check: the * planned cutoff point must be <= the current endpoint page. Otherwise we * have already wrapped around, and proceeding with the truncation would * risk removing the current segment. */ if (ctl->PagePrecedes(shared->latest_page_number, cutoffPage)) { LWLockRelease(shared->ControlLock); ereport(LOG, (errmsg("could not truncate directory \"%s\": apparent wraparound", ctl->Dir))); return; } for (slotno = 0; slotno < shared->num_slots; slotno++) { if (shared->page_status[slotno] == SLRU_PAGE_EMPTY) continue; if (!ctl->PagePrecedes(shared->page_number[slotno], cutoffPage)) continue; /* * If page is clean, just change state to EMPTY (expected case). */ if (shared->page_status[slotno] == SLRU_PAGE_VALID && !shared->page_dirty[slotno]) { shared->page_status[slotno] = SLRU_PAGE_EMPTY; continue; } /* * Hmm, we have (or may have) I/O operations acting on the page, so * we've got to wait for them to finish and then start again. This is * the same logic as in SlruSelectLRUPage. (XXX if page is dirty, * wouldn't it be OK to just discard it without writing it? For now, * keep the logic the same as it was.) */ if (shared->page_status[slotno] == SLRU_PAGE_VALID) SlruInternalWritePage(ctl, slotno, NULL); else SimpleLruWaitIO(ctl, slotno); goto restart; } LWLockRelease(shared->ControlLock); /* Now we can remove the old segment(s) */ (void) SlruScanDirectory(ctl, SlruScanDirCbDeleteCutoff, &cutoffPage); }
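/*
 * Two helpers sketching the arithmetic above: rounding a cutoff page down
 * to its segment boundary, and a circular "precedes" test of the general
 * kind the PagePrecedes callback implements for a wrapping page counter.
 * The constant and the wraparound convention are illustrative, not the
 * real module's.
 */
#include <stdbool.h>
#include <stdint.h>

#define PAGES_PER_SEGMENT 32

/* The cutoff point is the start of the segment containing the page. */
static int
segment_start(int pageno)
{
    return pageno - pageno % PAGES_PER_SEGMENT;
}

/* true if a precedes b on a circle of 2^32 page numbers */
static bool
page_precedes(uint32_t a, uint32_t b)
{
    return (int32_t) (a - b) < 0;
}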