/* * TransactionIdPrecedesOrEquals --- is id1 logically <= id2? */ bool TransactionIdPrecedesOrEquals(TransactionId id1, TransactionId id2) { int32 diff; if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2)) return (id1 <= id2); diff = (int32) (id1 - id2); return (diff <= 0); }
/* * TransactionIdFollowsOrEquals --- is id1 logically >= id2? */ bool TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2) { int32 diff; if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2)) return (id1 >= id2); diff = (int32) (id1 - id2); return (diff >= 0); }
/* * TransactionIdPrecedes --- is id1 logically < id2? */ bool TransactionIdPrecedes(TransactionId id1, TransactionId id2) { /* * If either ID is a permanent XID then we can just do unsigned * comparison. If both are normal, do a modulo-2^31 comparison. */ int32 diff; if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2)) return (id1 < id2); diff = (int32) (id1 - id2); return (diff < 0); }
/* * Interrogate the parent of a transaction in the subtrans log. */ TransactionId SubTransGetParent(TransactionId xid) { int pageno = TransactionIdToPage(xid); int entryno = TransactionIdToEntry(xid); int slotno; TransactionId *ptr; TransactionId parent; /* Can't ask about stuff that might not be around anymore */ Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin)); /* Bootstrap and frozen XIDs have no parent */ if (!TransactionIdIsNormal(xid)) return InvalidTransactionId; /* lock is acquired by SimpleLruReadPage_ReadOnly */ slotno = SimpleLruReadPage_ReadOnly(SubTransCtl, pageno, xid); ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno]; ptr += entryno; parent = *ptr; LWLockRelease(SubtransControlLock); return parent; }
/* * do a TransactionId -> txid conversion for an XID near the given epoch */ static txid convert_xid(TransactionId xid, const TxidEpoch *state) { #ifndef INT64_IS_BUSTED uint64 epoch; /* return special xid's as-is */ if (!TransactionIdIsNormal(xid)) return (txid) xid; /* xid can be on either side when near wrap-around */ epoch = (uint64) state->epoch; if (xid > state->last_xid && TransactionIdPrecedes(xid, state->last_xid)) epoch--; else if (xid < state->last_xid && TransactionIdFollows(xid, state->last_xid)) epoch++; return (epoch << 32) | xid; #else /* INT64_IS_BUSTED */ /* we can't do anything with the epoch, so ignore it */ return (txid) xid & MAX_TXID; #endif /* INT64_IS_BUSTED */ }
/* * TransactionIdGetCommitLSN * * This function returns an LSN that is late enough to be able * to guarantee that if we flush up to the LSN returned then we * will have flushed the transaction's commit record to disk. * * The result is not necessarily the exact LSN of the transaction's * commit record! For example, for long-past transactions (those whose * clog pages already migrated to disk), we'll return InvalidXLogRecPtr. * Also, because we group transactions on the same clog page to conserve * storage, we might return the LSN of a later transaction that falls into * the same group. */ XLogRecPtr TransactionIdGetCommitLSN(TransactionId xid) { XLogRecPtr result; /* * Currently, all uses of this function are for xids that were just * reported to be committed by TransactionLogFetch, so we expect that * checking TransactionLogFetch's cache will usually succeed and avoid an * extra trip to shared memory. */ if (TransactionIdEquals(xid, cachedFetchXid)) return cachedCommitLSN; /* Special XIDs are always known committed */ if (!TransactionIdIsNormal(xid)) return InvalidXLogRecPtr; /* * Get the transaction status. */ (void) TransactionIdGetStatus(xid, &result); return result; }
/* * GetOldestXmin -- returns oldest transaction that was running * when any current transaction was started. * * If allDbs is TRUE then all backends are considered; if allDbs is FALSE * then only backends running in my own database are considered. * * This is used by VACUUM to decide which deleted tuples must be preserved * in a table. allDbs = TRUE is needed for shared relations, but allDbs = * FALSE is sufficient for non-shared relations, since only backends in my * own database could ever see the tuples in them. * * Note: we include the currently running xids in the set of considered xids. * This ensures that if a just-started xact has not yet set its snapshot, * when it does set the snapshot it cannot set xmin less than what we compute. */ TransactionId GetOldestXmin(bool allDbs) { SISeg *segP = shmInvalBuffer; ProcState *stateP = segP->procState; TransactionId result; int index; result = GetCurrentTransactionId(); LWLockAcquire(SInvalLock, LW_SHARED); for (index = 0; index < segP->lastBackend; index++) { SHMEM_OFFSET pOffset = stateP[index].procStruct; if (pOffset != INVALID_OFFSET) { PGPROC *proc = (PGPROC *) MAKE_PTR(pOffset); if (allDbs || proc->databaseId == MyDatabaseId) { /* Fetch xid just once - see GetNewTransactionId */ TransactionId xid = proc->xid; if (TransactionIdIsNormal(xid)) { if (TransactionIdPrecedes(xid, result)) result = xid; xid = proc->xmin; if (TransactionIdIsNormal(xid)) if (TransactionIdPrecedes(xid, result)) result = xid; } } } } LWLockRelease(SInvalLock); return result; }
/* Record lowest soon-prunable XID */ static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid) { /* * This should exactly match the PageSetPrunable macro. We can't store * directly into the page header yet, so we update working state. */ Assert(TransactionIdIsNormal(xid)); if (!TransactionIdIsValid(prstate->new_prune_xid) || TransactionIdPrecedes(xid, prstate->new_prune_xid)) prstate->new_prune_xid = xid; }
/* * xid_age - compute age of an XID (relative to current xact) */ Datum xid_age(PG_FUNCTION_ARGS) { TransactionId xid = PG_GETARG_TRANSACTIONID(0); TransactionId now = GetTopTransactionId(); /* Permanent XIDs are always infinitely old */ if (!TransactionIdIsNormal(xid)) PG_RETURN_INT32(INT_MAX); PG_RETURN_INT32((int32) (now - xid)); }
/* * Sets the commit timestamp of a single transaction. * * Must be called with CommitTsControlLock held */ static void TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts, RepOriginId nodeid, int slotno) { int entryno = TransactionIdToCTsEntry(xid); CommitTimestampEntry entry; Assert(TransactionIdIsNormal(xid)); entry.time = ts; entry.nodeid = nodeid; memcpy(CommitTsCtl->shared->page_buffer[slotno] + SizeOfCommitTimestampEntry * entryno, &entry, SizeOfCommitTimestampEntry); }
/* * Get oldest Xid visible by any active transaction (global or local) * Take in account global Xmin received from DTMD */ static TransactionId DtmGetOldestXmin(Relation rel, bool ignoreVacuum) { TransactionId localXmin = PgGetOldestXmin(rel, ignoreVacuum); TransactionId globalXmin = dtm->minXid; XTM_INFO("XTM: DtmGetOldestXmin localXmin=%d, globalXmin=%d\n", localXmin, globalXmin); if (TransactionIdIsValid(globalXmin)) { globalXmin -= vacuum_defer_cleanup_age; if (!TransactionIdIsNormal(globalXmin)) globalXmin = FirstNormalTransactionId; if (TransactionIdPrecedes(globalXmin, localXmin)) localXmin = globalXmin; XTM_INFO("XTM: DtmGetOldestXmin adjusted localXmin=%d, globalXmin=%d\n", localXmin, globalXmin); } return localXmin; }
/* * TransactionLogFetch --- fetch commit status of specified transaction id */ static XidStatus TransactionLogFetch(TransactionId transactionId) { XidStatus xidstatus; XLogRecPtr xidlsn; /* * Before going to the commit log manager, check our single item cache to * see if we didn't just check the transaction status a moment ago. */ if (TransactionIdEquals(transactionId, cachedFetchXid)) return cachedFetchXidStatus; /* * Also, check to see if the transaction ID is a permanent one. */ if (!TransactionIdIsNormal(transactionId)) { if (TransactionIdEquals(transactionId, BootstrapTransactionId)) return TRANSACTION_STATUS_COMMITTED; if (TransactionIdEquals(transactionId, FrozenTransactionId)) return TRANSACTION_STATUS_COMMITTED; return TRANSACTION_STATUS_ABORTED; } /* * Get the transaction status. */ xidstatus = TransactionIdGetStatus(transactionId, &xidlsn); /* * Cache it, but DO NOT cache status for unfinished or sub-committed * transactions! We only cache status that is guaranteed not to change. */ if (xidstatus != TRANSACTION_STATUS_IN_PROGRESS && xidstatus != TRANSACTION_STATUS_SUB_COMMITTED) { cachedFetchXid = transactionId; cachedFetchXidStatus = xidstatus; cachedCommitLSN = xidlsn; } return xidstatus; }
static void SubTransGetData(TransactionId xid, SubTransData* subData) { MIRRORED_LOCK_DECLARE; int pageno = TransactionIdToPage(xid); int entryno = TransactionIdToEntry(xid); int slotno; SubTransData *ptr; /* Can't ask about stuff that might not be around anymore */ Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin)); /* Bootstrap and frozen XIDs have no parent and itself as topMostParent */ if (!TransactionIdIsNormal(xid)) { subData->parent = InvalidTransactionId; subData->topMostParent = xid; return; } MIRRORED_LOCK; /* lock is acquired by SimpleLruReadPage_ReadOnly */ slotno = SimpleLruReadPage_ReadOnly(SubTransCtl, pageno, xid, NULL); ptr = (SubTransData *) SubTransCtl->shared->page_buffer[slotno]; ptr += entryno; subData->parent = ptr->parent; subData->topMostParent = ptr->topMostParent; if ( subData->topMostParent == InvalidTransactionId ) { /* Here means parent is Main XID, hence set parent itself as topMostParent */ subData->topMostParent = xid; } LWLockRelease(SubtransControlLock); MIRRORED_UNLOCK; return; }
/* adapted from Postgres' txid.c#convert_xid function */ uint64 convert_xid(TransactionId xid) { TxidEpoch state; uint64 epoch; GetNextXidAndEpoch(&state.last_xid, &state.epoch); /* return special xid's as-is */ if (!TransactionIdIsNormal(xid)) return (uint64) xid; /* xid can be on either side when near wrap-around */ epoch = (uint64) state.epoch; if (xid > state.last_xid && TransactionIdPrecedes(xid, state.last_xid)) epoch--; else if (xid < state.last_xid && TransactionIdFollows(xid, state.last_xid)) epoch++; return (epoch << 32) | xid; }
/* * do a TransactionId -> txid conversion for an XID near the given epoch */ static txid convert_xid(TransactionId xid, const TxidEpoch *state) { uint64 epoch; /* return special xid's as-is */ if (!TransactionIdIsNormal(xid)) return (txid) xid; /* xid can be on either side when near wrap-around */ epoch = (uint64) state->epoch; if (xid > state->last_xid && TransactionIdPrecedes(xid, state->last_xid)) epoch--; else if (xid < state->last_xid && TransactionIdFollows(xid, state->last_xid)) epoch++; return (epoch << 32) | xid; }
Datum pg_last_committed_xact(PG_FUNCTION_ARGS) { TransactionId xid; TimestampTz ts; Datum values[2]; bool nulls[2]; TupleDesc tupdesc; HeapTuple htup; /* and construct a tuple with our data */ xid = GetLatestCommitTsData(&ts, NULL); /* * Construct a tuple descriptor for the result row. This must match this * function's pg_proc entry! */ tupdesc = CreateTemplateTupleDesc(2, false); TupleDescInitEntry(tupdesc, (AttrNumber) 1, "xid", XIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 2, "timestamp", TIMESTAMPTZOID, -1, 0); tupdesc = BlessTupleDesc(tupdesc); if (!TransactionIdIsNormal(xid)) { memset(nulls, true, sizeof(nulls)); } else { values[0] = TransactionIdGetDatum(xid); nulls[0] = false; values[1] = TimestampTzGetDatum(ts); nulls[1] = false; } htup = heap_form_tuple(tupdesc, values, nulls); PG_RETURN_DATUM(HeapTupleGetDatum(htup)); }
/* * Update local Recent*Xmin variables taken in account MinXmin received from DTMD */ static void DtmUpdateRecentXmin(Snapshot snapshot) { TransactionId xmin = dtm->minXid; XTM_INFO("XTM: DtmUpdateRecentXmin global xmin=%d, snapshot xmin %d\n", dtm->minXid, DtmSnapshot.xmin); if (TransactionIdIsValid(xmin)) { xmin -= vacuum_defer_cleanup_age; if (!TransactionIdIsNormal(xmin)) xmin = FirstNormalTransactionId; if (TransactionIdFollows(RecentGlobalDataXmin, xmin)) RecentGlobalDataXmin = xmin; if (TransactionIdFollows(RecentGlobalXmin, xmin)) RecentGlobalXmin = xmin; } if (TransactionIdFollows(RecentXmin, snapshot->xmin)) { ProcArrayInstallImportedXmin(snapshot->xmin, GetCurrentTransactionId()); RecentXmin = snapshot->xmin; } }
/*---------- * GetSnapshotData -- returns information about running transactions. * * The returned snapshot includes xmin (lowest still-running xact ID), * xmax (next xact ID to be assigned), and a list of running xact IDs * in the range xmin <= xid < xmax. It is used as follows: * All xact IDs < xmin are considered finished. * All xact IDs >= xmax are considered still running. * For an xact ID xmin <= xid < xmax, consult list to see whether * it is considered running or not. * This ensures that the set of transactions seen as "running" by the * current xact will not change after it takes the snapshot. * * We also compute the current global xmin (oldest xmin across all running * transactions) and save it in RecentGlobalXmin. This is the same * computation done by GetOldestXmin(TRUE). The xmin value is also stored * into RecentXmin. *---------- */ Snapshot GetSnapshotData(Snapshot snapshot, bool serializable) { SISeg *segP = shmInvalBuffer; ProcState *stateP = segP->procState; TransactionId xmin; TransactionId xmax; TransactionId globalxmin; int index; int count = 0; Assert(snapshot != NULL); /* * Allocating space for MaxBackends xids is usually overkill; * lastBackend would be sufficient. But it seems better to do the * malloc while not holding the lock, so we can't look at lastBackend. * * This does open a possibility for avoiding repeated malloc/free: * since MaxBackends does not change at runtime, we can simply reuse * the previous xip array if any. (This relies on the fact that all * calls pass static SnapshotData structs.) */ if (snapshot->xip == NULL) { /* * First call for this snapshot */ snapshot->xip = (TransactionId *) malloc(MaxBackends * sizeof(TransactionId)); if (snapshot->xip == NULL) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); } globalxmin = xmin = GetCurrentTransactionId(); /* * If we are going to set MyProc->xmin then we'd better get exclusive * lock; if not, this is a read-only operation so it can be shared. */ LWLockAcquire(SInvalLock, serializable ? LW_EXCLUSIVE : LW_SHARED); /*-------------------- * Unfortunately, we have to call ReadNewTransactionId() after acquiring * SInvalLock above. It's not good because ReadNewTransactionId() does * LWLockAcquire(XidGenLock), but *necessary*. We need to be sure that * no transactions exit the set of currently-running transactions * between the time we fetch xmax and the time we finish building our * snapshot. Otherwise we could have a situation like this: * * 1. Tx Old is running (in Read Committed mode). * 2. Tx S reads new transaction ID into xmax, then * is swapped out before acquiring SInvalLock. * 3. Tx New gets new transaction ID (>= S' xmax), * makes changes and commits. * 4. Tx Old changes some row R changed by Tx New and commits. * 5. Tx S finishes getting its snapshot data. It sees Tx Old as * done, but sees Tx New as still running (since New >= xmax). * * Now S will see R changed by both Tx Old and Tx New, *but* does not * see other changes made by Tx New. If S is supposed to be in * Serializable mode, this is wrong. * * By locking SInvalLock before we read xmax, we ensure that TX Old * cannot exit the set of running transactions seen by Tx S. Therefore * both Old and New will be seen as still running => no inconsistency. *-------------------- */ xmax = ReadNewTransactionId(); for (index = 0; index < segP->lastBackend; index++) { SHMEM_OFFSET pOffset = stateP[index].procStruct; if (pOffset != INVALID_OFFSET) { PGPROC *proc = (PGPROC *) MAKE_PTR(pOffset); /* Fetch xid just once - see GetNewTransactionId */ TransactionId xid = proc->xid; /* * Ignore my own proc (dealt with my xid above), procs not * running a transaction, and xacts started since we read the * next transaction ID. There's no need to store XIDs above * what we got from ReadNewTransactionId, since we'll treat * them as running anyway. We also assume that such xacts * can't compute an xmin older than ours, so they needn't be * considered in computing globalxmin. */ if (proc == MyProc || !TransactionIdIsNormal(xid) || TransactionIdFollowsOrEquals(xid, xmax)) continue; if (TransactionIdPrecedes(xid, xmin)) xmin = xid; snapshot->xip[count] = xid; count++; /* Update globalxmin to be the smallest valid xmin */ xid = proc->xmin; if (TransactionIdIsNormal(xid)) if (TransactionIdPrecedes(xid, globalxmin)) globalxmin = xid; } } if (serializable) MyProc->xmin = xmin; LWLockRelease(SInvalLock); /* Serializable snapshot must be computed before any other... */ Assert(TransactionIdIsValid(MyProc->xmin)); /* * Update globalxmin to include actual process xids. This is a * slightly different way of computing it than GetOldestXmin uses, but * should give the same result. */ if (TransactionIdPrecedes(xmin, globalxmin)) globalxmin = xmin; /* Update globals for use by VACUUM */ RecentGlobalXmin = globalxmin; RecentXmin = xmin; snapshot->xmin = xmin; snapshot->xmax = xmax; snapshot->xcnt = count; snapshot->curcid = GetCurrentCommandId(); return snapshot; }
/* * Interrogate the commit timestamp of a transaction. * * The return value indicates whether a commit timestamp record was found for * the given xid. The timestamp value is returned in *ts (which may not be * null), and the origin node for the Xid is returned in *nodeid, if it's not * null. */ bool TransactionIdGetCommitTsData(TransactionId xid, TimestampTz *ts, RepOriginId *nodeid) { int pageno = TransactionIdToCTsPage(xid); int entryno = TransactionIdToCTsEntry(xid); int slotno; CommitTimestampEntry entry; TransactionId oldestCommitTsXid; TransactionId newestCommitTsXid; if (!TransactionIdIsValid(xid)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("cannot retrieve commit timestamp for transaction %u", xid))); else if (!TransactionIdIsNormal(xid)) { /* frozen and bootstrap xids are always committed far in the past */ *ts = 0; if (nodeid) *nodeid = 0; return false; } LWLockAcquire(CommitTsLock, LW_SHARED); /* Error if module not enabled */ if (!commitTsShared->commitTsActive) error_commit_ts_disabled(); /* * If we're asked for the cached value, return that. Otherwise, fall * through to read from SLRU. */ if (commitTsShared->xidLastCommit == xid) { *ts = commitTsShared->dataLastCommit.time; if (nodeid) *nodeid = commitTsShared->dataLastCommit.nodeid; LWLockRelease(CommitTsLock); return *ts != 0; } oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid; newestCommitTsXid = ShmemVariableCache->newestCommitTsXid; /* neither is invalid, or both are */ Assert(TransactionIdIsValid(oldestCommitTsXid) == TransactionIdIsValid(newestCommitTsXid)); LWLockRelease(CommitTsLock); /* * Return empty if the requested value is outside our valid range. */ if (!TransactionIdIsValid(oldestCommitTsXid) || TransactionIdPrecedes(xid, oldestCommitTsXid) || TransactionIdPrecedes(newestCommitTsXid, xid)) { *ts = 0; if (nodeid) *nodeid = InvalidRepOriginId; return false; } /* lock is acquired by SimpleLruReadPage_ReadOnly */ slotno = SimpleLruReadPage_ReadOnly(CommitTsCtl, pageno, xid); memcpy(&entry, CommitTsCtl->shared->page_buffer[slotno] + SizeOfCommitTimestampEntry * entryno, SizeOfCommitTimestampEntry); *ts = entry.time; if (nodeid) *nodeid = entry.nodeid; LWLockRelease(CommitTsControlLock); return *ts != 0; }
/* * write_database_file: update the flat database file * * A side effect is to determine the oldest database's datfrozenxid * so we can set or update the XID wrap limit. * * Also, if "startup" is true, we tell relcache.c to clear out the relcache * init file in each database. That's a bit nonmodular, but scanning * pg_database twice during system startup seems too high a price for keeping * things better separated. */ static void write_database_file(Relation drel, bool startup) { StringInfoData buffer; HeapScanDesc scan; HeapTuple tuple; NameData oldest_datname; TransactionId oldest_datfrozenxid = InvalidTransactionId; MirroredFlatFileOpen mirroredOpen; initStringInfo(&buffer); MirroredFlatFile_Open( &mirroredOpen, "global", "pg_database", O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY, S_IRUSR | S_IWUSR, /* suppressError */ false, /* atomic operation */ true, /*isMirrorRecovery */ false); /* * Read pg_database and write the file. */ scan = heap_beginscan(drel, SnapshotNow, 0, NULL); while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) { Form_pg_database dbform = (Form_pg_database) GETSTRUCT(tuple); char *datname; Oid datoid; Oid dattablespace; TransactionId datfrozenxid; datname = NameStr(dbform->datname); datoid = HeapTupleGetOid(tuple); dattablespace = dbform->dattablespace; datfrozenxid = dbform->datfrozenxid; /* * Identify the oldest datfrozenxid. This must match * the logic in vac_truncate_clog() in vacuum.c. * * MPP-20053: Skip databases that cannot be connected to in computing * the oldest database. */ if (dbform->datallowconn && TransactionIdIsNormal(datfrozenxid)) { if (oldest_datfrozenxid == InvalidTransactionId || TransactionIdPrecedes(datfrozenxid, oldest_datfrozenxid)) { oldest_datfrozenxid = datfrozenxid; namestrcpy(&oldest_datname, datname); } } /* * Check for illegal characters in the database name. */ if (!name_okay(datname)) { ereport(LOG, (errmsg("invalid database name \"%s\"", datname))); continue; } /* * The file format is: "dbname" oid tablespace frozenxid * * The xids are not needed for backend startup, but are of use to * autovacuum, and might also be helpful for forensic purposes. */ sputs_quote(&buffer, datname); appendStringInfo(&buffer, " %u %u %u\n", datoid, dattablespace, datfrozenxid); /* * MPP-10111 - During database expansion we need to be able to bring a * database up in order to correct the filespace locations in the * catalog. At this point we will not be able to resolve database paths * for databases not stored in "pg_default" or "pg_global". * * This is solved by passing a special guc to the startup during this * phase of expand to bypass logic involving non-system tablespaces. * Since we are bypassing the clearing of the relation cache on these * databases we need to ensure that we don't try to use them at all * elsewhere. This is done with a similar check in * PersistentTablespace_GetPrimaryAndMirrorFilespaces(). */ if (gp_before_filespace_setup && !IsBuiltinTablespace(dattablespace)) continue; } heap_endscan(scan); MirroredFlatFile_Append(&mirroredOpen, buffer.data, buffer.len, /* suppressError */ false); MirroredFlatFile_Flush(&mirroredOpen, /* suppressError */ false); MirroredFlatFile_Close(&mirroredOpen); if (buffer.maxlen > 0) pfree(buffer.data); /* * Set the transaction ID wrap limit using the oldest datfrozenxid */ if (oldest_datfrozenxid != InvalidTransactionId) SetTransactionIdLimit(oldest_datfrozenxid, &oldest_datname); }
/* * Hot Standby feedback */ static void ProcessStandbyHSFeedbackMessage(void) { StandbyHSFeedbackMessage reply; TransactionId nextXid; uint32 nextEpoch; /* Decipher the reply message */ pq_copymsgbytes(&reply_message, (char *) &reply, sizeof(StandbyHSFeedbackMessage)); elog(DEBUG2, "hot standby feedback xmin %u epoch %u", reply.xmin, reply.epoch); /* Ignore invalid xmin (can't actually happen with current walreceiver) */ if (!TransactionIdIsNormal(reply.xmin)) return; /* * Check that the provided xmin/epoch are sane, that is, not in the future * and not so far back as to be already wrapped around. Ignore if not. * * Epoch of nextXid should be same as standby, or if the counter has * wrapped, then one greater than standby. */ GetNextXidAndEpoch(&nextXid, &nextEpoch); if (reply.xmin <= nextXid) { if (reply.epoch != nextEpoch) return; } else { if (reply.epoch + 1 != nextEpoch) return; } if (!TransactionIdPrecedesOrEquals(reply.xmin, nextXid)) return; /* epoch OK, but it's wrapped around */ /* * Set the WalSender's xmin equal to the standby's requested xmin, so that * the xmin will be taken into account by GetOldestXmin. This will hold * back the removal of dead rows and thereby prevent the generation of * cleanup conflicts on the standby server. * * There is a small window for a race condition here: although we just * checked that reply.xmin precedes nextXid, the nextXid could have gotten * advanced between our fetching it and applying the xmin below, perhaps * far enough to make reply.xmin wrap around. In that case the xmin we * set here would be "in the future" and have no effect. No point in * worrying about this since it's too late to save the desired data * anyway. Assuming that the standby sends us an increasing sequence of * xmins, this could only happen during the first reply cycle, else our * own xmin would prevent nextXid from advancing so far. * * We don't bother taking the ProcArrayLock here. Setting the xmin field * is assumed atomic, and there's no real need to prevent a concurrent * GetOldestXmin. (If we're moving our xmin forward, this is obviously * safe, and if we're moving it backwards, well, the data is at risk * already since a VACUUM could have just finished calling GetOldestXmin.) */ MyPgXact->xmin = reply.xmin; }
/* * This is a copy of swap_relation_files in cluster.c, but it also swaps * relfrozenxid. */ static void swap_heap_or_index_files(Oid r1, Oid r2) { Relation relRelation; HeapTuple reltup1, reltup2; Form_pg_class relform1, relform2; Oid swaptemp; CatalogIndexState indstate; /* We need writable copies of both pg_class tuples. */ relRelation = heap_open(RelationRelationId, RowExclusiveLock); reltup1 = SearchSysCacheCopy(RELOID, ObjectIdGetDatum(r1), 0, 0, 0); if (!HeapTupleIsValid(reltup1)) elog(ERROR, "cache lookup failed for relation %u", r1); relform1 = (Form_pg_class) GETSTRUCT(reltup1); reltup2 = SearchSysCacheCopy(RELOID, ObjectIdGetDatum(r2), 0, 0, 0); if (!HeapTupleIsValid(reltup2)) elog(ERROR, "cache lookup failed for relation %u", r2); relform2 = (Form_pg_class) GETSTRUCT(reltup2); Assert(relform1->relkind == relform2->relkind); /* * Actually swap the fields in the two tuples */ swaptemp = relform1->relfilenode; relform1->relfilenode = relform2->relfilenode; relform2->relfilenode = swaptemp; swaptemp = relform1->reltablespace; relform1->reltablespace = relform2->reltablespace; relform2->reltablespace = swaptemp; swaptemp = relform1->reltoastrelid; relform1->reltoastrelid = relform2->reltoastrelid; relform2->reltoastrelid = swaptemp; /* set rel1's frozen Xid to larger one */ if (TransactionIdIsNormal(relform1->relfrozenxid)) { if (TransactionIdFollows(relform1->relfrozenxid, relform2->relfrozenxid)) relform1->relfrozenxid = relform2->relfrozenxid; else relform2->relfrozenxid = relform1->relfrozenxid; } /* swap size statistics too, since new rel has freshly-updated stats */ { #if PG_VERSION_NUM >= 90300 int32 swap_pages; #else int4 swap_pages; #endif float4 swap_tuples; swap_pages = relform1->relpages; relform1->relpages = relform2->relpages; relform2->relpages = swap_pages; swap_tuples = relform1->reltuples; relform1->reltuples = relform2->reltuples; relform2->reltuples = swap_tuples; } /* Update the tuples in pg_class */ simple_heap_update(relRelation, &reltup1->t_self, reltup1); simple_heap_update(relRelation, &reltup2->t_self, reltup2); /* Keep system catalogs current */ indstate = CatalogOpenIndexes(relRelation); CatalogIndexInsert(indstate, reltup1); CatalogIndexInsert(indstate, reltup2); CatalogCloseIndexes(indstate); /* * If we have toast tables associated with the relations being swapped, * change their dependency links to re-associate them with their new * owning relations. Otherwise the wrong one will get dropped ... * * NOTE: it is possible that only one table has a toast table; this can * happen in CLUSTER if there were dropped columns in the old table, and * in ALTER TABLE when adding or changing type of columns. * * NOTE: at present, a TOAST table's only dependency is the one on its * owning table. If more are ever created, we'd need to use something * more selective than deleteDependencyRecordsFor() to get rid of only the * link we want. */ if (relform1->reltoastrelid || relform2->reltoastrelid) { ObjectAddress baseobject, toastobject; long count; /* Delete old dependencies */ if (relform1->reltoastrelid) { count = deleteDependencyRecordsFor(RelationRelationId, relform1->reltoastrelid, false); if (count != 1) elog(ERROR, "expected one dependency record for TOAST table, found %ld", count); } if (relform2->reltoastrelid) { count = deleteDependencyRecordsFor(RelationRelationId, relform2->reltoastrelid, false); if (count != 1) elog(ERROR, "expected one dependency record for TOAST table, found %ld", count); } /* Register new dependencies */ baseobject.classId = RelationRelationId; baseobject.objectSubId = 0; toastobject.classId = RelationRelationId; toastobject.objectSubId = 0; if (relform1->reltoastrelid) { baseobject.objectId = r1; toastobject.objectId = relform1->reltoastrelid; recordDependencyOn(&toastobject, &baseobject, DEPENDENCY_INTERNAL); } if (relform2->reltoastrelid) { baseobject.objectId = r2; toastobject.objectId = relform2->reltoastrelid; recordDependencyOn(&toastobject, &baseobject, DEPENDENCY_INTERNAL); } } /* * Blow away the old relcache entries now. We need this kluge because * relcache.c keeps a link to the smgr relation for the physical file, and * that will be out of date as soon as we do CommandCounterIncrement. * Whichever of the rels is the second to be cleared during cache * invalidation will have a dangling reference to an already-deleted smgr * relation. Rather than trying to avoid this by ordering operations just * so, it's easiest to not have the relcache entries there at all. * (Fortunately, since one of the entries is local in our transaction, * it's sufficient to clear out our own relcache this way; the problem * cannot arise for other backends when they see our update on the * non-local relation.) */ RelationForgetRelation(r1); RelationForgetRelation(r2); /* Clean up. */ heap_freetuple(reltup1); heap_freetuple(reltup2); heap_close(relRelation, RowExclusiveLock); }
/* * Interrogate the commit timestamp of a transaction. * * The return value indicates whether a commit timestamp record was found for * the given xid. The timestamp value is returned in *ts (which may not be * null), and the origin node for the Xid is returned in *nodeid, if it's not * null. */ bool TransactionIdGetCommitTsData(TransactionId xid, TimestampTz *ts, RepOriginId *nodeid) { int pageno = TransactionIdToCTsPage(xid); int entryno = TransactionIdToCTsEntry(xid); int slotno; CommitTimestampEntry entry; TransactionId oldestCommitTs; TransactionId newestCommitTs; /* error if the given Xid doesn't normally commit */ if (!TransactionIdIsNormal(xid)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("cannot retrieve commit timestamp for transaction %u", xid))); LWLockAcquire(CommitTsLock, LW_SHARED); /* Error if module not enabled */ if (!commitTsShared->commitTsActive) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("could not get commit timestamp data"), errhint("Make sure the configuration parameter \"%s\" is set.", "track_commit_timestamp"))); /* * If we're asked for the cached value, return that. Otherwise, fall * through to read from SLRU. */ if (commitTsShared->xidLastCommit == xid) { *ts = commitTsShared->dataLastCommit.time; if (nodeid) *nodeid = commitTsShared->dataLastCommit.nodeid; LWLockRelease(CommitTsLock); return *ts != 0; } oldestCommitTs = ShmemVariableCache->oldestCommitTs; newestCommitTs = ShmemVariableCache->newestCommitTs; /* neither is invalid, or both are */ Assert(TransactionIdIsValid(oldestCommitTs) == TransactionIdIsValid(newestCommitTs)); LWLockRelease(CommitTsLock); /* * Return empty if the requested value is outside our valid range. */ if (!TransactionIdIsValid(oldestCommitTs) || TransactionIdPrecedes(xid, oldestCommitTs) || TransactionIdPrecedes(newestCommitTs, xid)) { *ts = 0; if (nodeid) *nodeid = InvalidRepOriginId; return false; } /* lock is acquired by SimpleLruReadPage_ReadOnly */ slotno = SimpleLruReadPage_ReadOnly(CommitTsCtl, pageno, xid); memcpy(&entry, CommitTsCtl->shared->page_buffer[slotno] + SizeOfCommitTimestampEntry * entryno, SizeOfCommitTimestampEntry); *ts = entry.time; if (nodeid) *nodeid = entry.nodeid; LWLockRelease(CommitTsControlLock); return *ts != 0; }
/* * ImportSnapshot * Import a previously exported snapshot. The argument should be a * filename in SNAPSHOT_EXPORT_DIR. Load the snapshot from that file. * This is called by "SET TRANSACTION SNAPSHOT 'foo'". */ void ImportSnapshot(const char *idstr) { char path[MAXPGPATH]; FILE *f; struct stat stat_buf; char *filebuf; int xcnt; int i; TransactionId src_xid; Oid src_dbid; int src_isolevel; bool src_readonly; SnapshotData snapshot; /* * Must be at top level of a fresh transaction. Note in particular that * we check we haven't acquired an XID --- if we have, it's conceivable * that the snapshot would show it as not running, making for very * screwy behavior. */ if (FirstSnapshotSet || GetTopTransactionIdIfAny() != InvalidTransactionId || IsSubTransaction()) ereport(ERROR, (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), errmsg("SET TRANSACTION SNAPSHOT must be called before any query"))); /* * If we are in read committed mode then the next query would execute * with a new snapshot thus making this function call quite useless. */ if (!IsolationUsesXactSnapshot()) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("a snapshot-importing transaction must have isolation level SERIALIZABLE or REPEATABLE READ"))); /* * Verify the identifier: only 0-9, A-F and hyphens are allowed. We do * this mainly to prevent reading arbitrary files. */ if (strspn(idstr, "0123456789ABCDEF-") != strlen(idstr)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid snapshot identifier \"%s\"", idstr))); /* OK, read the file */ snprintf(path, MAXPGPATH, SNAPSHOT_EXPORT_DIR "/%s", idstr); f = AllocateFile(path, PG_BINARY_R); if (!f) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid snapshot identifier \"%s\"", idstr))); /* get the size of the file so that we know how much memory we need */ if (fstat(fileno(f), &stat_buf)) elog(ERROR, "could not stat file \"%s\": %m", path); /* and read the file into a palloc'd string */ filebuf = (char *) palloc(stat_buf.st_size + 1); if (fread(filebuf, stat_buf.st_size, 1, f) != 1) elog(ERROR, "could not read file \"%s\": %m", path); filebuf[stat_buf.st_size] = '\0'; FreeFile(f); /* * Construct a snapshot struct by parsing the file content. */ memset(&snapshot, 0, sizeof(snapshot)); src_xid = parseXidFromText("xid:", &filebuf, path); /* we abuse parseXidFromText a bit here ... */ src_dbid = parseXidFromText("dbid:", &filebuf, path); src_isolevel = parseIntFromText("iso:", &filebuf, path); src_readonly = parseIntFromText("ro:", &filebuf, path); snapshot.xmin = parseXidFromText("xmin:", &filebuf, path); snapshot.xmax = parseXidFromText("xmax:", &filebuf, path); snapshot.xcnt = xcnt = parseIntFromText("xcnt:", &filebuf, path); /* sanity-check the xid count before palloc */ if (xcnt < 0 || xcnt > GetMaxSnapshotXidCount()) ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("invalid snapshot data in file \"%s\"", path))); snapshot.xip = (TransactionId *) palloc(xcnt * sizeof(TransactionId)); for (i = 0; i < xcnt; i++) snapshot.xip[i] = parseXidFromText("xip:", &filebuf, path); snapshot.suboverflowed = parseIntFromText("sof:", &filebuf, path); if (!snapshot.suboverflowed) { snapshot.subxcnt = xcnt = parseIntFromText("sxcnt:", &filebuf, path); /* sanity-check the xid count before palloc */ if (xcnt < 0 || xcnt > GetMaxSnapshotSubxidCount()) ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("invalid snapshot data in file \"%s\"", path))); snapshot.subxip = (TransactionId *) palloc(xcnt * sizeof(TransactionId)); for (i = 0; i < xcnt; i++) snapshot.subxip[i] = parseXidFromText("sxp:", &filebuf, path); } else { snapshot.subxcnt = 0; snapshot.subxip = NULL; } snapshot.takenDuringRecovery = parseIntFromText("rec:", &filebuf, path); /* * Do some additional sanity checking, just to protect ourselves. We * don't trouble to check the array elements, just the most critical * fields. */ if (!TransactionIdIsNormal(src_xid) || !OidIsValid(src_dbid) || !TransactionIdIsNormal(snapshot.xmin) || !TransactionIdIsNormal(snapshot.xmax)) ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("invalid snapshot data in file \"%s\"", path))); /* * If we're serializable, the source transaction must be too, otherwise * predicate.c has problems (SxactGlobalXmin could go backwards). Also, * a non-read-only transaction can't adopt a snapshot from a read-only * transaction, as predicate.c handles the cases very differently. */ if (IsolationIsSerializable()) { if (src_isolevel != XACT_SERIALIZABLE) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("a serializable transaction cannot import a snapshot from a non-serializable transaction"))); if (src_readonly && !XactReadOnly) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("a non-read-only serializable transaction cannot import a snapshot from a read-only transaction"))); } /* * We cannot import a snapshot that was taken in a different database, * because vacuum calculates OldestXmin on a per-database basis; so the * source transaction's xmin doesn't protect us from data loss. This * restriction could be removed if the source transaction were to mark * its xmin as being globally applicable. But that would require some * additional syntax, since that has to be known when the snapshot is * initially taken. (See pgsql-hackers discussion of 2011-10-21.) */ if (src_dbid != MyDatabaseId) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot import a snapshot from a different database"))); /* OK, install the snapshot */ SetTransactionSnapshot(&snapshot, src_xid); }