/*
 * PersistentStore_DoInsertTuple
 *
 * Build a heap tuple from 'values' (no attribute is ever marked NULL) and
 * insert it into the persistent table with frozen_heap_insert().
 *
 * storeData        - descriptor of the persistent store; supplies the
 *                    attribute count and the table name used in messages.
 * storeSharedData  - not referenced by this variant.
 * persistentRel    - the persistent table relation to insert into.
 * values           - attribute values for the new tuple.
 * flushToXLog      - when true, flush XLOG through the end of the last
 *                    inserted record before returning; when false, only
 *                    remember that location in nowaitXLogEndLoc.
 * persistentTid    - output: TID assigned to the inserted tuple.
 */
static void
PersistentStore_DoInsertTuple(
    PersistentStoreData *storeData,
    PersistentStoreSharedData *storeSharedData,
    Relation persistentRel,
    Datum *values,
    bool flushToXLog,
    ItemPointer persistentTid)
{
    bool       *isNull;
    HeapTuple   newTuple = NULL;
    XLogRecPtr  insertEndLoc;

    /*
     * All-false null flags: tuples are kept at exactly the same size so
     * that free tuples can be reused directly.
     */
    isNull = (bool *) palloc0(storeData->numAttributes * sizeof(bool));

    newTuple = heap_form_tuple(persistentRel->rd_att, values, isNull);
    if (!HeapTupleIsValid(newTuple))
        elog(ERROR, "Failed to build persistent tuple ('%s')",
             storeData->tableName);

    frozen_heap_insert(persistentRel, newTuple);

    if (Debug_persistent_store_print)
        elog(PersistentStore_DebugPrintLevel(),
             "PersistentStore_DoInsertTuple: new insert TID %s ('%s')",
             ItemPointerToString2(&newTuple->t_self),
             storeData->tableName);

    /* Hand back the new TID and capture the XLOG end location. */
    *persistentTid = newTuple->t_self;
    insertEndLoc = XLogLastInsertEndLoc();

    heap_freetuple(newTuple);

    if (!flushToXLog)
    {
        /* Deferred: remember how far a later flush has to go. */
        nowaitXLogEndLoc = insertEndLoc;
    }
    else
    {
        XLogFlush(insertEndLoc);
        XLogRecPtr_Zero(&nowaitXLogEndLoc);
    }

    pfree(isNull);
}
/* * Execute the CREATE BARRIER command. Write a BARRIER WAL record and flush the * WAL buffers to disk before returning to the caller. Writing the WAL record * does not guarantee successful completion of the barrier command. */ void ProcessCreateBarrierExecute(const char *id) { StringInfoData buf; if (!IsConnFromCoord()) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("The CREATE BARRIER EXECUTE message is expected to " "arrive from a Coordinator"))); { XLogRecData rdata[1]; XLogRecPtr recptr; rdata[0].data = (char *) id; rdata[0].len = strlen(id) + 1; rdata[0].buffer = InvalidBuffer; rdata[0].next = NULL; recptr = XLogInsert(RM_BARRIER_ID, XLOG_BARRIER_CREATE, rdata); XLogFlush(recptr); } pq_beginmessage(&buf, 'b'); pq_sendstring(&buf, id); pq_endmessage(&buf); pq_flush(); }
/*
 * finish_sync_worker
 *
 * Shut down the table synchronization worker: commit whatever transaction
 * is open, flush WAL, log a completion message, wake the main apply worker
 * and exit the process.  This function does not return, because it ends
 * with proc_exit(0).
 */
finish_sync_worker(void)
{
    /*
     * Commit any outstanding transaction. This is the usual case, unless
     * there was nothing to do for the table.
     */
    if (IsTransactionState())
    {
        CommitTransactionCommand();
        pgstat_report_stat(false);
    }

    /* And flush all writes, up to the current WAL write position. */
    XLogFlush(GetXLogWriteRecPtr());

    /*
     * The log message is built inside its own transaction.
     * NOTE(review): presumably because get_rel_name() needs catalog access
     * -- confirm.
     */
    StartTransactionCommand();
    ereport(LOG,
            (errmsg("logical replication table synchronization worker for subscription \"%s\", table \"%s\" has finished",
                    MySubscription->name,
                    get_rel_name(MyLogicalRepWorker->relid))));
    CommitTransactionCommand();

    /* Find the main apply worker and signal it. */
    logicalrep_worker_wakeup(MyLogicalRepWorker->subid, InvalidOid);

    /* Stop gracefully */
    proc_exit(0);
}
/*
 * PersistentStore_FlushXLog
 *
 * Flush XLOG through the location remembered in nowaitXLogEndLoc, then
 * reset that location to zero.  Does nothing when both fields of the
 * remembered location are already zero.
 */
void
PersistentStore_FlushXLog(void)
{
    /* Nothing pending: both halves of the saved location are zero. */
    if (nowaitXLogEndLoc.xlogid == 0 && nowaitXLogEndLoc.xrecoff == 0)
        return;

    XLogFlush(nowaitXLogEndLoc);
    XLogRecPtr_Zero(&nowaitXLogEndLoc);
}
/* * Write a TRUNCATE xlog record * * We must flush the xlog record to disk before returning --- see notes * in DistributedLog_Truncate(). * * Note: xlog record is marked as outside transaction control, since we * want it to be redone whether the invoking transaction commits or not. */ static void DistributedLog_WriteTruncateXlogRec(int page) { XLogRecData rdata; XLogRecPtr recptr; rdata.data = (char *) (&page); rdata.len = sizeof(int); rdata.buffer = InvalidBuffer; rdata.next = NULL; recptr = XLogInsert(RM_DISTRIBUTEDLOG_ID, DISTRIBUTEDLOG_TRUNCATE | XLOG_NO_TRAN, &rdata); XLogFlush(recptr); }
/*
 * PersistentStore_FreeTuple
 *
 * Free a persistent tuple by deleting it from the persistent table with
 * simple_heap_delete_xid() under FrozenTransactionId, and decrement the
 * shared in-use counter.
 *
 * storeData        - descriptor of the persistent store (open/close
 *                    callbacks, table name for messages).
 * storeSharedData  - shared state; its inUseCount is decremented.
 * persistentTid    - TID of the stored tuple to free; must be valid.
 * freeValues       - not referenced by this variant.
 * flushToXLog      - when true, flush XLOG through the end of the last
 *                    inserted record; when false, only remember that
 *                    location in nowaitXLogEndLoc.
 *
 * The caller is asserted to hold PersistentObjLock.
 */
void
PersistentStore_FreeTuple(
    PersistentStoreData *storeData,
    PersistentStoreSharedData *storeSharedData,
    ItemPointer persistentTid,
    Datum *freeValues,
    bool flushToXLog)
{
    Relation    rel;
    XLogRecPtr  deleteEndLoc;   /* end location of the last XLOG record */

    Assert(LWLockHeldByMe(PersistentObjLock));

#ifdef USE_ASSERT_CHECKING
    if (storeSharedData == NULL ||
        !PersistentStoreSharedData_EyecatcherIsValid(storeSharedData))
        elog(ERROR, "Persistent store shared-memory not valid");
#endif

    if (Debug_persistent_store_print)
        elog(PersistentStore_DebugPrintLevel(),
             "PersistentStore_FreeTuple: Going to free tuple at TID %s ('%s', shared data %p)",
             ItemPointerToString(persistentTid),
             storeData->tableName,
             storeSharedData);

    Assert(ItemPointerIsValid(persistentTid));

    rel = storeData->openRel();

    simple_heap_delete_xid(rel, persistentTid, FrozenTransactionId);

    /* Capture the XLOG end location right after the delete. */
    deleteEndLoc = XLogLastInsertEndLoc();

    storeData->closeRel(rel);

    storeSharedData->inUseCount--;

    if (!flushToXLog)
    {
        /* Deferred: remember how far a later flush has to go. */
        nowaitXLogEndLoc = deleteEndLoc;
    }
    else
    {
        XLogFlush(deleteEndLoc);
        XLogRecPtr_Zero(&nowaitXLogEndLoc);
    }
}
/*
 * Write out a new shared or local map file with the given contents.
 *
 * The magic number and CRC are automatically updated in *newmap.  On
 * success, we copy the data to the appropriate permanent static variable
 * (shared_map or local_map).
 *
 * Parameters:
 *	shared			- true to write "global/<RELMAPPER_FILENAME>" and update
 *					  shared_map; false to write
 *					  "<dbpath>/<RELMAPPER_FILENAME>" and update local_map.
 *	newmap			- map contents to write; magic and crc are overwritten.
 *	write_wal		- if true, an XLOG_RELMAP_UPDATE record is inserted and
 *					  flushed before the file is written.  (It will be false
 *					  for bootstrap and WAL replay cases.)
 *	send_sinval		- if true, CacheInvalidateRelmap(dbid) is called after
 *					  the file is written.  (This should be true except in
 *					  bootstrap case.)
 *	preserve_files	- if true, RelationPreserveStorage() is called for every
 *					  file listed in the map, so the storage manager does
 *					  not delete them.
 *	dbid, tsid		- database and tablespace OIDs, stored in the WAL record
 *					  and used to build RelFileNodes.
 *	dbpath			- database directory; only used when shared is false.
 *
 * Because this may be called during WAL replay when MyDatabaseId,
 * DatabasePath, etc aren't valid, we require the caller to pass in suitable
 * values.  The caller is also responsible for being sure no concurrent
 * map update could be happening.
 *
 * Raises ERROR on a bogus mapping count or on open/write/fsync/close
 * failure.
 */
static void
write_relmap_file(bool shared, RelMapFile *newmap,
                  bool write_wal, bool send_sinval, bool preserve_files,
                  Oid dbid, Oid tsid, const char *dbpath)
{
    int         fd;
    RelMapFile *realmap;
    char        mapfilename[MAXPGPATH];

    /*
     * Fill in the overhead fields and update CRC.  The CRC covers every
     * byte of the struct that precedes the crc field itself.
     */
    newmap->magic = RELMAPPER_FILEMAGIC;
    if (newmap->num_mappings < 0 || newmap->num_mappings > MAX_MAPPINGS)
        elog(ERROR, "attempt to write bogus relation mapping");

    INIT_CRC32(newmap->crc);
    COMP_CRC32(newmap->crc, (char *) newmap, offsetof(RelMapFile, crc));
    FIN_CRC32(newmap->crc);

    /*
     * Open the target file.  We prefer to do this before entering the
     * critical section, so that an open() failure need not force PANIC.
     */
    if (shared)
    {
        snprintf(mapfilename, sizeof(mapfilename), "global/%s",
                 RELMAPPER_FILENAME);
        realmap = &shared_map;
    }
    else
    {
        snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
                 dbpath, RELMAPPER_FILENAME);
        realmap = &local_map;
    }

    fd = OpenTransientFile(mapfilename,
                           O_WRONLY | O_CREAT | PG_BINARY,
                           S_IRUSR | S_IWUSR);
    if (fd < 0)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not open relation mapping file \"%s\": %m",
                        mapfilename)));

    if (write_wal)
    {
        xl_relmap_update xlrec;
        XLogRecData rdata[2];
        XLogRecPtr  lsn;

        /* now errors are fatal ... */
        START_CRIT_SECTION();

        xlrec.dbid = dbid;
        xlrec.tsid = tsid;
        xlrec.nbytes = sizeof(RelMapFile);

        /* Two-element chain: fixed header, then the whole map image. */
        rdata[0].data = (char *) (&xlrec);
        rdata[0].len = MinSizeOfRelmapUpdate;
        rdata[0].buffer = InvalidBuffer;
        rdata[0].next = &(rdata[1]);
        rdata[1].data = (char *) newmap;
        rdata[1].len = sizeof(RelMapFile);
        rdata[1].buffer = InvalidBuffer;
        rdata[1].next = NULL;

        lsn = XLogInsert(RM_RELMAP_ID, XLOG_RELMAP_UPDATE, rdata);

        /* As always, WAL must hit the disk before the data update does */
        XLogFlush(lsn);
    }

    /* Clear errno so that a short write without an errno can be detected. */
    errno = 0;
    if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile))
    {
        /* if write didn't set errno, assume problem is no disk space */
        if (errno == 0)
            errno = ENOSPC;
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not write to relation mapping file \"%s\": %m",
                        mapfilename)));
    }

    /*
     * We choose to fsync the data to disk before considering the task done.
     * It would be possible to relax this if it turns out to be a performance
     * issue, but it would complicate checkpointing --- see notes for
     * CheckPointRelationMap.
     */
    if (pg_fsync(fd) != 0)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not fsync relation mapping file \"%s\": %m",
                        mapfilename)));

    if (CloseTransientFile(fd))
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not close relation mapping file \"%s\": %m",
                        mapfilename)));

    /*
     * Now that the file is safely on disk, send sinval message to let other
     * backends know to re-read it.  We must do this inside the critical
     * section: if for some reason we fail to send the message, we have to
     * force a database-wide PANIC.  Otherwise other backends might continue
     * execution with stale mapping information, which would be catastrophic
     * as soon as others began to use the now-committed data.
     *
     * NOTE(review): the critical section is only entered above when
     * write_wal is true, so with write_wal false this step and the next one
     * run outside any critical section started here -- confirm that this is
     * intended for the bootstrap/replay callers.
     */
    if (send_sinval)
        CacheInvalidateRelmap(dbid);

    /*
     * Make sure that the files listed in the map are not deleted if the outer
     * transaction aborts.  This had better be within the critical section
     * too: it's not likely to fail, but if it did, we'd arrive at transaction
     * abort with the files still vulnerable.  PANICing will leave things in a
     * good state on-disk.
     *
     * Note: we're cheating a little bit here by assuming that mapped files
     * are either in pg_global or the database's default tablespace.
     */
    if (preserve_files)
    {
        int32       i;

        for (i = 0; i < newmap->num_mappings; i++)
        {
            RelFileNode rnode;

            rnode.spcNode = tsid;
            rnode.dbNode = dbid;
            rnode.relNode = newmap->mappings[i].mapfilenode;
            RelationPreserveStorage(rnode, false);
        }
    }

    /* Success, update permanent copy */
    memcpy(realmap, newmap, sizeof(RelMapFile));

    /* Critical section done (it was only started when write_wal is set) */
    if (write_wal)
        END_CRIT_SECTION();
}
/* * Reserve WAL for the currently active slot. * * Compute and set restart_lsn in a manner that's appropriate for the type of * the slot and concurrency safe. */ void ReplicationSlotReserveWal(void) { ReplicationSlot *slot = MyReplicationSlot; Assert(slot != NULL); Assert(slot->data.restart_lsn == InvalidXLogRecPtr); /* * The replication slot mechanism is used to prevent removal of required * WAL. As there is no interlock between this routine and checkpoints, WAL * segments could concurrently be removed when a now stale return value of * ReplicationSlotsComputeRequiredLSN() is used. In the unlikely case that * this happens we'll just retry. */ while (true) { XLogSegNo segno; /* * For logical slots log a standby snapshot and start logical decoding * at exactly that position. That allows the slot to start up more * quickly. * * That's not needed (or indeed helpful) for physical slots as they'll * start replay at the last logged checkpoint anyway. Instead return * the location of the last redo LSN. While that slightly increases * the chance that we have to retry, it's where a base backup has to * start replay at. */ if (!RecoveryInProgress() && SlotIsLogical(slot)) { XLogRecPtr flushptr; /* start at current insert position */ slot->data.restart_lsn = GetXLogInsertRecPtr(); /* make sure we have enough information to start */ flushptr = LogStandbySnapshot(); /* and make sure it's fsynced to disk */ XLogFlush(flushptr); } else { slot->data.restart_lsn = GetRedoRecPtr(); } /* prevent WAL removal as fast as possible */ ReplicationSlotsComputeRequiredLSN(); /* * If all required WAL is still there, great, otherwise retry. The * slot should prevent further removal of WAL, unless there's a * concurrent ReplicationSlotsComputeRequiredLSN() after we've written * the new restart_lsn above, so normally we should never need to loop * more than twice. */ XLByteToSeg(slot->data.restart_lsn, segno, wal_segment_size); if (XLogGetLastRemovedSegno() < segno) break; } }
/*
 * RelationTruncate
 *		Physically truncate a relation to 'nblocks' blocks.
 *
 * The free space map and visibility map forks are truncated first when they
 * exist; for non-temp relations the truncation is WAL-logged before the
 * main fork is actually truncated.
 */
void
RelationTruncate(Relation rel, BlockNumber nblocks)
{
    bool        hasFsm;
    bool        hasVm;

    /* Make sure the relation is open at the smgr level */
    RelationOpenSmgr(rel);

    /* Forget cached block numbers that might point past the new end */
    rel->rd_smgr->smgr_targblock = InvalidBlockNumber;
    rel->rd_smgr->smgr_fsm_nblocks = InvalidBlockNumber;
    rel->rd_smgr->smgr_vm_nblocks = InvalidBlockNumber;

    /* FSM goes first, if there is one */
    hasFsm = smgrexists(rel->rd_smgr, FSM_FORKNUM);
    if (hasFsm)
        FreeSpaceMapTruncateRel(rel, nblocks);

    /* Then the visibility map, if there is one */
    hasVm = smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
    if (hasVm)
        visibilitymap_truncate(rel, nblocks);

    /*
     * The truncation is WAL-logged before it is carried out, which spells
     * trouble if the truncation then fails: after a crash, WAL replay is
     * unlikely to succeed in truncating either and will PANIC.  A critical
     * section here is tempting, but that cure would be worse than the
     * disease: it would turn a usually harmless truncation failure, which
     * merely might cause trouble at replay, into a certain PANIC.
     */
    if (!rel->rd_istemp)
    {
        xl_smgr_truncate truncInfo;
        XLogRecData truncRec;
        XLogRecPtr  truncLsn;

        /* Build the XLOG entry that reports the file truncation */
        truncInfo.blkno = nblocks;
        truncInfo.rnode = rel->rd_node;

        truncRec.data = (char *) &truncInfo;
        truncRec.len = sizeof(truncInfo);
        truncRec.buffer = InvalidBuffer;
        truncRec.next = NULL;

        truncLsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE, &truncRec);

        /*
         * Flush when an FSM or visibility map was truncated: otherwise the
         * main-fork truncation could reach disk before the WAL record and
         * before the FSM/VM truncation.  A crash in that window would leave
         * a truncated heap whose FSM or visibility map still has entries
         * for heap pages that no longer exist.
         */
        if (hasFsm || hasVm)
            XLogFlush(truncLsn);
    }

    /* Finally, truncate the main fork itself */
    smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks, rel->rd_istemp);
}
/*
 * PersistentStore_ReplaceTuple
 *
 * Replace a set of columns of an existing persistent tuple in place.
 *
 * storeData        - descriptor of the persistent store (open/close and
 *                    print callbacks, attribute count, table name).
 * storeSharedData  - shared state; only validated and printed here.
 * persistentTid    - TID of the stored tuple being replaced.
 * tuple            - current contents of the tuple; the base for the
 *                    modification.
 * newValues        - replacement column values.
 * replaces         - per-column flags selecting which columns to replace.
 * flushToXLog      - when true, flush XLOG through the end of the last
 *                    inserted record; when false, only remember that
 *                    location in nowaitXLogEndLoc.
 *
 * With Debug_persistent_store_print set, the tuple is fetched back after
 * the update and passed to the store's printTupleCallback.
 */
void
PersistentStore_ReplaceTuple(
    PersistentStoreData *storeData,
    PersistentStoreSharedData *storeSharedData,
    ItemPointer persistentTid,
    HeapTuple tuple,
    Datum *newValues,
    bool *replaces,
    bool flushToXLog)
{
    Relation    rel;
    bool       *isNull;
    HeapTuple   modified = NULL;
    XLogRecPtr  updateEndLoc;

#ifdef USE_ASSERT_CHECKING
    if (storeSharedData == NULL ||
        !PersistentStoreSharedData_EyecatcherIsValid(storeSharedData))
        elog(ERROR, "Persistent store shared-memory not valid");
#endif

    if (Debug_persistent_store_print)
        elog(PersistentStore_DebugPrintLevel(),
             "PersistentStore_ReplaceTuple: Going to replace set of columns in tuple at TID %s ('%s', shared data %p)",
             ItemPointerToString(persistentTid),
             storeData->tableName,
             storeSharedData);

    rel = storeData->openRel();

    /*
     * All-false null flags: tuples are kept at exactly the same size so
     * that free tuples can be reused directly.
     */
    isNull = (bool *) palloc0(storeData->numAttributes * sizeof(bool));

    /* Build the modified image and overwrite the tuple at its TID. */
    modified = heap_modify_tuple(tuple, rel->rd_att,
                                 newValues, isNull, replaces);
    modified->t_self = *persistentTid;

    frozen_heap_inplace_update(rel, modified);

    /* Capture the XLOG end location right after the update. */
    updateEndLoc = XLogLastInsertEndLoc();

    heap_freetuple(modified);
    pfree(isNull);

    if (Debug_persistent_store_print)
    {
        Datum      *colValues;
        bool       *colNulls;
        HeapTupleData fetched;
        Buffer      buf;
        HeapTuple   fetchedCopy;

        elog(PersistentStore_DebugPrintLevel(),
             "PersistentStore_ReplaceTuple: Replaced set of columns in tuple at TID %s ('%s')",
             ItemPointerToString(persistentTid),
             storeData->tableName);

        colValues = (Datum *) palloc(storeData->numAttributes * sizeof(Datum));
        colNulls = (bool *) palloc(storeData->numAttributes * sizeof(bool));

        /* Read the tuple back from the table. */
        fetched.t_self = *persistentTid;
        if (!heap_fetch(rel, SnapshotAny, &fetched, &buf, false, NULL))
        {
            elog(ERROR, "Failed to fetch persistent tuple at %s ('%s')",
                 ItemPointerToString(&fetched.t_self),
                 storeData->tableName);
        }

        /* Copy the tuple so the buffer can be released before printing. */
        fetchedCopy = heaptuple_copy_to(&fetched, NULL, NULL);
        ReleaseBuffer(buf);

        heap_deform_tuple(fetchedCopy, rel->rd_att, colValues, colNulls);

        storeData->printTupleCallback(PersistentStore_DebugPrintLevel(),
                                      "STORE REPLACED TUPLE",
                                      persistentTid,
                                      colValues);

        heap_freetuple(fetchedCopy);
        pfree(colValues);
        pfree(colNulls);
    }

    storeData->closeRel(rel);

    if (!flushToXLog)
    {
        /* Deferred: remember how far a later flush has to go. */
        nowaitXLogEndLoc = updateEndLoc;
    }
    else
    {
        XLogFlush(updateEndLoc);
        XLogRecPtr_Zero(&nowaitXLogEndLoc);
    }
}
/*
 * RecordTransactionAbortPrepared
 *
 * Essentially the same job as RecordTransactionAbort, for a prepared
 * transaction.
 *
 * The transaction is known to have made at least one XLOG entry (its
 * PREPARE), so the abort record can never be optimized away.
 *
 * xid                  - the prepared transaction being aborted.
 * nchildren, children  - subtransaction XIDs, appended to the record.
 * nrels, rels          - relation files to delete, appended to the record.
 */
static void
RecordTransactionAbortPrepared(TransactionId xid,
                               int nchildren,
                               TransactionId *children,
                               int nrels,
                               RelFileNode *rels)
{
    XLogRecData rdata[3];
    XLogRecData *tail = &rdata[0];  /* last element linked into the chain */
    xl_xact_abort_prepared xlrec;
    XLogRecPtr  recptr;

    /*
     * Guard against having failed partway through
     * RecordTransactionCommitPrepared ...
     */
    if (TransactionIdDidCommit(xid))
        elog(PANIC, "cannot abort transaction %u, it was already committed",
             xid);

    START_CRIT_SECTION();

    /* Fixed part of the XLOG abort record */
    xlrec.xid = xid;
    xlrec.arec.xact_time = GetCurrentTimestamp();
    xlrec.arec.nrels = nrels;
    xlrec.arec.nsubxacts = nchildren;

    rdata[0].data = (char *) &xlrec;
    rdata[0].len = MinSizeOfXactAbortPrepared;
    rdata[0].buffer = InvalidBuffer;

    /* Optional part: relation files to delete */
    if (nrels > 0)
    {
        rdata[1].data = (char *) rels;
        rdata[1].len = nrels * sizeof(RelFileNode);
        rdata[1].buffer = InvalidBuffer;
        tail->next = &rdata[1];
        tail = &rdata[1];
    }

    /* Optional part: child XIDs */
    if (nchildren > 0)
    {
        rdata[2].data = (char *) children;
        rdata[2].len = nchildren * sizeof(TransactionId);
        rdata[2].buffer = InvalidBuffer;
        tail->next = &rdata[2];
        tail = &rdata[2];
    }

    tail->next = NULL;

    recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_ABORT_PREPARED, rdata);

    /* Always flush, since the 2PC state file is about to be removed */
    XLogFlush(recptr);

    /*
     * Record the abort in clog as well.  Not strictly required, but we may
     * as well do it while we are here.
     */
    TransactionIdAbortTree(xid, nchildren, children);

    END_CRIT_SECTION();
}
/*
 * PersistentStore_FreeTuple
 *
 * Free a persistent tuple by overwriting it in place with a "free" image
 * that links it into the store's chain of free tuples, and decrement the
 * shared in-use counter.
 *
 * storeData        - descriptor of the persistent store (open/close
 *                    callbacks, table name for messages).
 * storeSharedData  - shared state; maxFreeOrderNum, freeTid and inUseCount
 *                    are updated.
 * persistentTid    - TID of the stored tuple to free; its ip_posid must be
 *                    non-zero.
 * freeValues       - attribute values used to form the free tuple.
 * flushToXLog      - when true, flush XLOG through the end of the last
 *                    inserted record; when false, only remember that
 *                    location in nowaitXLogEndLoc.
 */
void
PersistentStore_FreeTuple(
    PersistentStoreData *storeData,
    PersistentStoreSharedData *storeSharedData,
    ItemPointer persistentTid,
    Datum *freeValues,
    bool flushToXLog)
{
    Relation    rel;
    HeapTuple   freeTuple = NULL;
    ItemPointerData previousFree;
    XLogRecPtr  updateEndLoc;   /* end location of the last XLOG record */

#ifdef USE_ASSERT_CHECKING
    if (storeSharedData == NULL ||
        !PersistentStoreSharedData_EyecatcherIsValid(storeSharedData))
        elog(ERROR, "Persistent store shared-memory not valid");
#endif

    if (Debug_persistent_store_print)
        elog(PersistentStore_DebugPrintLevel(),
             "PersistentStore_FreeTuple: Going to free tuple at TID %s ('%s', shared data %p)",
             ItemPointerToString(persistentTid),
             storeData->tableName,
             storeSharedData);

    Assert(persistentTid->ip_posid != 0);

    rel = storeData->openRel();

    /*
     * Work out the previous-free link.  The very first free tuple points at
     * itself, so that a non-zero previous-free TID always indicates "free".
     */
    storeSharedData->maxFreeOrderNum++;
    previousFree = (storeSharedData->maxFreeOrderNum == 1)
        ? *persistentTid
        : storeSharedData->freeTid;

    /* This tuple becomes the new head of the free chain. */
    storeSharedData->freeTid = *persistentTid;

    PersistentStore_FormTupleSetOurs(storeData,
                                     rel->rd_att,
                                     freeValues,
                                     storeSharedData->maxFreeOrderNum,
                                     &previousFree,
                                     &freeTuple);

    freeTuple->t_self = *persistentTid;

    frozen_heap_inplace_update(rel, freeTuple);

    /* Capture the XLOG end location right after the update. */
    updateEndLoc = XLogLastInsertEndLoc();

    heap_freetuple(freeTuple);

    storeData->closeRel(rel);

    storeSharedData->inUseCount--;

    if (Debug_persistent_store_print)
        elog(PersistentStore_DebugPrintLevel(),
             "PersistentStore_FreeTuple: Freed tuple at TID %s. Maximum free order number " INT64_FORMAT ", in use count " INT64_FORMAT " ('%s')",
             ItemPointerToString(&storeSharedData->freeTid),
             storeSharedData->maxFreeOrderNum,
             storeSharedData->inUseCount,
             storeData->tableName);

    if (!flushToXLog)
    {
        /* Deferred: remember how far a later flush has to go. */
        nowaitXLogEndLoc = updateEndLoc;
    }
    else
    {
        XLogFlush(updateEndLoc);
        XLogRecPtr_Zero(&nowaitXLogEndLoc);
    }
}
/*
 * RecordTransactionCommitPrepared
 *
 * Essentially the same job as RecordTransactionCommit, for a prepared
 * transaction; in particular CheckpointStartLock has to be held to avoid a
 * race condition.
 *
 * The transaction is known to have made at least one XLOG entry (its
 * PREPARE), so the commit record can never be optimized away.
 *
 * xid                  - the prepared transaction being committed.
 * nchildren, children  - committed subtransaction XIDs.
 * nrels, rels          - relation files to delete, appended to the record.
 */
static void
RecordTransactionCommitPrepared(TransactionId xid,
                                int nchildren,
                                TransactionId *children,
                                int nrels,
                                RelFileNode *rels)
{
    XLogRecData rdata[3];
    XLogRecData *tail = &rdata[0];  /* last element linked into the chain */
    xl_xact_commit_prepared xlrec;
    XLogRecPtr  recptr;

    START_CRIT_SECTION();

    /* See notes in RecordTransactionCommit */
    LWLockAcquire(CheckpointStartLock, LW_SHARED);

    /* Fixed part of the XLOG commit record */
    xlrec.xid = xid;
    xlrec.crec.xtime = time(NULL);
    xlrec.crec.nrels = nrels;
    xlrec.crec.nsubxacts = nchildren;

    rdata[0].data = (char *) &xlrec;
    rdata[0].len = MinSizeOfXactCommitPrepared;
    rdata[0].buffer = InvalidBuffer;

    /* Optional part: relation files to delete */
    if (nrels > 0)
    {
        rdata[1].data = (char *) rels;
        rdata[1].len = nrels * sizeof(RelFileNode);
        rdata[1].buffer = InvalidBuffer;
        tail->next = &rdata[1];
        tail = &rdata[1];
    }

    /* Optional part: committed child XIDs */
    if (nchildren > 0)
    {
        rdata[2].data = (char *) children;
        rdata[2].len = nchildren * sizeof(TransactionId);
        rdata[2].buffer = InvalidBuffer;
        tail->next = &rdata[2];
        tail = &rdata[2];
    }

    tail->next = NULL;

    recptr = XLogInsert(RM_XACT_ID,
                        XLOG_XACT_COMMIT_PREPARED | XLOG_NO_TRAN,
                        rdata);

    /* no attempt is currently made to sleep before the flush here ... */

    /* Force the record out to disk */
    XLogFlush(recptr);

    /* Record the commit in pg_clog */
    TransactionIdCommit(xid);

    /* the parent must be committed before its children, to avoid races */
    TransactionIdCommitTree(nchildren, children);

    /* Checkpoints may start again */
    LWLockRelease(CheckpointStartLock);

    END_CRIT_SECTION();
}
/*
 * smgr_redo
 *
 * WAL redo routine for storage manager records.  Handles
 * XLOG_SMGR_CREATE and XLOG_SMGR_TRUNCATE; any other op code is a PANIC.
 *
 * lsn     - location passed to XLogFlush() before a truncation.
 * record  - the XLOG record being replayed.
 */
void
smgr_redo(XLogRecPtr lsn, XLogRecord *record)
{
    uint8       info = record->xl_info & ~XLR_INFO_MASK;

    /* smgr records never carry backup blocks */
    Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));

    switch (info)
    {
        case XLOG_SMGR_CREATE:
            {
                xl_smgr_create *createRec =
                    (xl_smgr_create *) XLogRecGetData(record);
                SMgrRelation smgrRel;

                smgrRel = smgropen(createRec->rnode, InvalidBackendId);
                smgrcreate(smgrRel, createRec->forkNum, true);
                break;
            }

        case XLOG_SMGR_TRUNCATE:
            {
                xl_smgr_truncate *truncRec =
                    (xl_smgr_truncate *) XLogRecGetData(record);
                SMgrRelation smgrRel;
                Relation    fakeRel;

                smgrRel = smgropen(truncRec->rnode, InvalidBackendId);

                /*
                 * Create the relation forcibly if it is missing (which
                 * suggests it was dropped later in the WAL sequence).  As
                 * in XLogReadBuffer, we would rather recreate it and replay
                 * as well as we can until the drop shows up.
                 */
                smgrcreate(smgrRel, MAIN_FORKNUM, true);

                /*
                 * Advance the minimum recovery point to cover this record
                 * before truncating: once the relation is truncated there
                 * is no going back.  For ordinary updates the buffer
                 * manager enforces the WAL-first rule, so the minimum
                 * recovery point is always updated before the matching
                 * data-file change reaches disk; here it has to be done by
                 * hand.
                 *
                 * Doing it first means that a failing truncation keeps the
                 * system from starting, even after a restart, until the
                 * underlying problem is fixed.  Updating the minimum
                 * recovery point afterwards instead would leave a small
                 * window in which the WAL-first rule could be violated.
                 */
                XLogFlush(lsn);

                smgrtruncate(smgrRel, MAIN_FORKNUM, truncRec->blkno);

                /* Let xlogutils.c know as well */
                XLogTruncateRelation(truncRec->rnode, MAIN_FORKNUM,
                                     truncRec->blkno);

                /* The FSM and VM forks are truncated too, when present */
                fakeRel = CreateFakeRelcacheEntry(truncRec->rnode);

                if (smgrexists(smgrRel, FSM_FORKNUM))
                    FreeSpaceMapTruncateRel(fakeRel, truncRec->blkno);
                if (smgrexists(smgrRel, VISIBILITYMAP_FORKNUM))
                    visibilitymap_truncate(fakeRel, truncRec->blkno);

                FreeFakeRelcacheEntry(fakeRel);
                break;
            }

        default:
            elog(PANIC, "smgr_redo: unknown op code %u", info);
    }
}
/*
 * RecordTransactionCommit
 *
 * Record the commit of the current transaction: emit an XLOG commit record
 * when the transaction made transaction-controlled XLOG entries, flush XLOG
 * when it made any XLOG entry at all, and mark the XID committed in clog
 * when needed.  Afterwards the per-transaction bookkeeping variables
 * (MyLastRecPtr, MyXactMadeXLogEntry, MyXactMadeTempRelUpdate,
 * MyProc->logRec) are reset.
 */
void
RecordTransactionCommit(void)
{
    /*
     * If we made neither any XLOG entries nor any temp-rel updates, we
     * can omit recording the transaction commit at all.
     */
    if (MyXactMadeXLogEntry || MyXactMadeTempRelUpdate)
    {
        TransactionId xid = GetCurrentTransactionId();
        bool        madeTCentries;
        XLogRecPtr  recptr;

        /* Tell bufmgr and smgr to prepare for commit */
        BufmgrCommit();

        START_CRIT_SECTION();

        /*
         * If our transaction made any transaction-controlled XLOG entries,
         * we need to lock out checkpoint start between writing our XLOG
         * record and updating pg_clog.  Otherwise it is possible for the
         * checkpoint to set REDO after the XLOG record but fail to flush the
         * pg_clog update to disk, leading to loss of the transaction commit
         * if we crash a little later.  Slightly klugy fix for problem
         * discovered 2004-08-10.
         *
         * (If it made no transaction-controlled XLOG entries, its XID
         * appears nowhere in permanent storage, so no one else will ever care
         * if it committed; so it doesn't matter if we lose the commit flag.)
         *
         * Note we only need a shared lock.
         */
        madeTCentries = (MyLastRecPtr.xrecoff != 0);
        if (madeTCentries)
            LWLockAcquire(CheckpointStartLock, LW_SHARED);

        /*
         * We only need to log the commit in XLOG if the transaction made
         * any transaction-controlled XLOG entries.
         */
        if (madeTCentries)
        {
            /* Need to emit a commit record */
            XLogRecData rdata;
            xl_xact_commit xlrec;

            xlrec.xtime = time(NULL);
            rdata.buffer = InvalidBuffer;
            rdata.data = (char *) (&xlrec);
            rdata.len = SizeOfXactCommit;
            rdata.next = NULL;

            /*
             * XXX SHOULD SAVE ARRAY OF RELFILENODE-s TO DROP
             */
            recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT, &rdata);
        }
        else
        {
            /* Just flush through last record written by me */
            recptr = ProcLastRecEnd;
        }

        /*
         * We must flush our XLOG entries to disk if we made any XLOG
         * entries, whether in or out of transaction control.  For
         * example, if we reported a nextval() result to the client, this
         * ensures that any XLOG record generated by nextval will hit the
         * disk before we report the transaction committed.
         */
        if (MyXactMadeXLogEntry)
        {
            /*
             * Sleep before flush! So we can flush more than one commit
             * records per single fsync.  (The idea is some other backend
             * may do the XLogFlush while we're sleeping.  This needs work
             * still, because on most Unixen, the minimum select() delay
             * is 10msec or more, which is way too long.)
             *
             * We do not sleep if enableFsync is not turned on, nor if there
             * are fewer than CommitSiblings other backends with active
             * transactions.
             */
            if (CommitDelay > 0 && enableFsync &&
                CountActiveBackends() >= CommitSiblings)
            {
                struct timeval delay;

                /* CommitDelay is used directly as a microsecond count. */
                delay.tv_sec = 0;
                delay.tv_usec = CommitDelay;
                (void) select(0, NULL, NULL, NULL, &delay);
            }

            XLogFlush(recptr);
        }

        /*
         * We must mark the transaction committed in clog if its XID
         * appears either in permanent rels or in local temporary rels.  We
         * test this by seeing if we made transaction-controlled entries
         * *OR* local-rel tuple updates.  Note that if we made only the
         * latter, we have not emitted an XLOG record for our commit, and
         * so in the event of a crash the clog update might be lost.  This
         * is okay because no one else will ever care whether we
         * committed.
         */
        if (MyLastRecPtr.xrecoff != 0 || MyXactMadeTempRelUpdate)
            TransactionIdCommit(xid);

        /* Unlock checkpoint lock if we acquired it */
        if (madeTCentries)
            LWLockRelease(CheckpointStartLock);

        END_CRIT_SECTION();
    }

    /* Break the chain of back-links in the XLOG records I output */
    MyLastRecPtr.xrecoff = 0;
    MyXactMadeXLogEntry = false;
    MyXactMadeTempRelUpdate = false;

    /* Show myself as out of the transaction in PGPROC array */
    MyProc->logRec.xrecoff = 0;
}
/*
 * Physical write of a page from a buffer slot
 *
 * On failure, we cannot just ereport(ERROR) since caller has put state in
 * shared memory that must be undone.  So, we return FALSE and save enough
 * info in static variables (slru_errcause, slru_errno) to let
 * SlruReportIOError make the report.
 *
 * For now, assume it's not worth keeping a file pointer open across
 * independent read/write operations.  We do batch operations during
 * SimpleLruFlush, though.
 *
 * Parameters:
 *	ctl		- SLRU control structure; supplies the shared state, the
 *			  do_fsync flag and the file naming.
 *	pageno	- page number to write; determines segment and file offset.
 *	slotno	- buffer slot holding the page image.
 *	fdata	- NULL for a standalone write, pointer to open-file info during
 *			  SimpleLruFlush.
 *
 * Returns true on success, false on open/seek/write/fsync/close failure.
 */
static bool
SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
{
    SlruShared  shared = ctl->shared;
    int         segno = pageno / SLRU_PAGES_PER_SEGMENT;
    int         rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
    int         offset = rpageno * BLCKSZ;
    char        path[MAXPGPATH];
    int         fd = -1;

    /*
     * Honor the write-WAL-before-data rule, if appropriate, so that we do not
     * write out data before associated WAL records.  This is the same action
     * performed during FlushBuffer() in the main buffer manager.
     */
    if (shared->group_lsn != NULL)
    {
        /*
         * We must determine the largest async-commit LSN for the page. This
         * is a bit tedious, but since this entire function is a slow path
         * anyway, it seems better to do this here than to maintain a per-page
         * LSN variable (which'd need an extra comparison in the
         * transaction-commit path).
         */
        XLogRecPtr  max_lsn;
        int         lsnindex,
                    lsnoff;

        lsnindex = slotno * shared->lsn_groups_per_page;
        max_lsn = shared->group_lsn[lsnindex++];
        for (lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
        {
            XLogRecPtr  this_lsn = shared->group_lsn[lsnindex++];

            if (XLByteLT(max_lsn, this_lsn))
                max_lsn = this_lsn;
        }

        if (!XLogRecPtrIsInvalid(max_lsn))
        {
            /*
             * As noted above, elog(ERROR) is not acceptable here, so if
             * XLogFlush were to fail, we must PANIC.  This isn't much of a
             * restriction because XLogFlush is just about all critical
             * section anyway, but let's make sure.
             */
            START_CRIT_SECTION();
            XLogFlush(max_lsn);
            END_CRIT_SECTION();
        }
    }

    /*
     * During a Flush, we may already have the desired file open.
     */
    if (fdata)
    {
        int         i;

        for (i = 0; i < fdata->num_files; i++)
        {
            if (fdata->segno[i] == segno)
            {
                fd = fdata->fd[i];
                break;
            }
        }
    }

    if (fd < 0)
    {
        /*
         * If the file doesn't already exist, we should create it.  It is
         * possible for this to need to happen when writing a page that's not
         * first in its segment; we assume the OS can cope with that. (Note:
         * it might seem that it'd be okay to create files only when
         * SimpleLruZeroPage is called for the first page of a segment.
         * However, if after a crash and restart the REDO logic elects to
         * replay the log from a checkpoint before the latest one, then it's
         * possible that we will get commands to set transaction status of
         * transactions that have already been truncated from the commit log.
         * Easiest way to deal with that is to accept references to
         * nonexistent files here and in SlruPhysicalReadPage.)
         *
         * Note: it is possible for more than one backend to be executing this
         * code simultaneously for different pages of the same file. Hence,
         * don't use O_EXCL or O_TRUNC or anything like that.
         */
        SlruFileName(ctl, path, segno);
        fd = BasicOpenFile(path, O_RDWR | O_CREAT | PG_BINARY,
                           S_IRUSR | S_IWUSR);
        if (fd < 0)
        {
            slru_errcause = SLRU_OPEN_FAILED;
            slru_errno = errno;
            return false;
        }

        if (fdata)
        {
            if (fdata->num_files < MAX_FLUSH_BUFFERS)
            {
                fdata->fd[fdata->num_files] = fd;
                fdata->segno[fdata->num_files] = segno;
                fdata->num_files++;
            }
            else
            {
                /*
                 * In the unlikely event that we exceed MAX_FLUSH_BUFFERS,
                 * fall back to treating it as a standalone write.
                 */
                fdata = NULL;
            }
        }
    }

    if (lseek(fd, (off_t) offset, SEEK_SET) < 0)
    {
        slru_errcause = SLRU_SEEK_FAILED;
        slru_errno = errno;
        /* Only close descriptors that are not owned by the flush state. */
        if (!fdata)
            close(fd);
        return false;
    }

    errno = 0;
    if (write(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ)
    {
        /* if write didn't set errno, assume problem is no disk space */
        if (errno == 0)
            errno = ENOSPC;
        slru_errcause = SLRU_WRITE_FAILED;
        slru_errno = errno;
        if (!fdata)
            close(fd);
        return false;
    }

#ifdef XP_TRACE_LRU_WRITE
    {
        struct timeval tv;

        /*
         * 'path' is only filled in above when the file was opened in this
         * call; when the descriptor came from fdata it would still be
         * uninitialized.  Build the name here so the trace message never
         * prints garbage.
         */
        SlruFileName(ctl, path, segno);

        gettimeofday(&tv, NULL);
        ereport(TRACE_LEVEL,
                (errmsg("%ld.%ld:\tWRITE:\tSlruPhysicalWritePage:\tfile:%s",
                        (long) tv.tv_sec, (long) tv.tv_usec, path)));
    }
#endif

    /*
     * If not part of Flush, need to fsync now.  We assume this happens
     * infrequently enough that it's not a performance issue.
     */
    if (!fdata)
    {
        if (ctl->do_fsync && pg_fsync(fd))
        {
            slru_errcause = SLRU_FSYNC_FAILED;
            slru_errno = errno;
            close(fd);
            return false;
        }

        if (close(fd))
        {
            slru_errcause = SLRU_CLOSE_FAILED;
            slru_errno = errno;
            return false;
        }
    }

    return true;
}
/*
 * RecordTransactionCommitPrepared
 *
 * Essentially the same job as RecordTransactionCommit, for a prepared
 * transaction; in particular the inCommit flag has to be set to avoid a
 * race condition.
 *
 * The transaction is known to have made at least one XLOG entry (its
 * PREPARE), so the commit record can never be optimized away.
 *
 * xid                  - the prepared transaction being committed.
 * nchildren, children  - committed subtransaction XIDs.
 * nrels, rels          - relation files to delete, appended to the record.
 */
static void
RecordTransactionCommitPrepared(TransactionId xid,
                                int nchildren,
                                TransactionId *children,
                                int nrels,
                                RelFileNode *rels)
{
    XLogRecData rdata[3];
    XLogRecData *tail = &rdata[0];  /* last element linked into the chain */
    xl_xact_commit_prepared xlrec;
    XLogRecPtr  recptr;

    START_CRIT_SECTION();

    /* See notes in RecordTransactionCommit */
    MyProc->inCommit = true;

    /* Fixed part of the XLOG commit record */
    xlrec.xid = xid;
    xlrec.crec.xact_time = GetCurrentTimestamp();
    xlrec.crec.nrels = nrels;
    xlrec.crec.nsubxacts = nchildren;

    rdata[0].data = (char *) &xlrec;
    rdata[0].len = MinSizeOfXactCommitPrepared;
    rdata[0].buffer = InvalidBuffer;

    /* Optional part: relation files to delete */
    if (nrels > 0)
    {
        rdata[1].data = (char *) rels;
        rdata[1].len = nrels * sizeof(RelFileNode);
        rdata[1].buffer = InvalidBuffer;
        tail->next = &rdata[1];
        tail = &rdata[1];
    }

    /* Optional part: committed child XIDs */
    if (nchildren > 0)
    {
        rdata[2].data = (char *) children;
        rdata[2].len = nchildren * sizeof(TransactionId);
        rdata[2].buffer = InvalidBuffer;
        tail->next = &rdata[2];
        tail = &rdata[2];
    }

    tail->next = NULL;

    recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT_PREPARED, rdata);

    /*
     * No attempt is currently made to sleep before the flush here, and
     * there is no support for async commit of a prepared xact either (the
     * very idea is probably a contradiction).
     */

    /* Force the record out to disk */
    XLogFlush(recptr);

    /* Record the commit of the whole tree in pg_clog */
    TransactionIdCommitTree(xid, nchildren, children);

    /* Checkpoint can proceed now */
    MyProc->inCommit = false;

    END_CRIT_SECTION();
}
static void PersistentStore_DoInsertTuple( PersistentStoreData *storeData, PersistentStoreSharedData *storeSharedData, Relation persistentRel, /* The persistent table relation. */ Datum *values, bool flushToXLog, /* When true, the XLOG record for this change will be flushed to disk. */ ItemPointer persistentTid) /* TID of the stored tuple. */ { bool *nulls; HeapTuple persistentTuple = NULL; XLogRecPtr xlogInsertEndLoc; /* * In order to keep the tuples the exact same size to enable direct reuse of * free tuples, we do not use NULLs. */ nulls = (bool*)palloc0(storeData->numAttributes * sizeof(bool)); /* * Form the tuple. */ persistentTuple = heap_form_tuple(persistentRel->rd_att, values, nulls); if (!HeapTupleIsValid(persistentTuple)) elog(ERROR, "Failed to build persistent tuple ('%s')", storeData->tableName); /* * (We have an exclusive lock (higher up) here so we can direct the insert to the last page.) */ { // Do not assert valid ItemPointer -- it is ok if it is (0,0)... BlockNumber blockNumber = BlockIdGetBlockNumber( &storeSharedData->maxTid.ip_blkid); frozen_heap_insert_directed( persistentRel, persistentTuple, blockNumber); } if (Debug_persistent_store_print) elog(PersistentStore_DebugPrintLevel(), "PersistentStore_DoInsertTuple: old maximum known TID %s, new insert TID %s ('%s')", ItemPointerToString(&storeSharedData->maxTid), ItemPointerToString2(&persistentTuple->t_self), storeData->tableName); if (ItemPointerCompare( &storeSharedData->maxTid, &persistentTuple->t_self) == -1) { // Current max is Less-Than. storeSharedData->maxTid = persistentTuple->t_self; } /* * Return the TID of the INSERT tuple. * Return the XLOG location of the INSERT tuple's XLOG record. */ *persistentTid = persistentTuple->t_self; xlogInsertEndLoc = XLogLastInsertEndLoc(); heap_freetuple(persistentTuple); if (flushToXLog) { XLogFlush(xlogInsertEndLoc); XLogRecPtr_Zero(&nowaitXLogEndLoc); } else nowaitXLogEndLoc = xlogInsertEndLoc; pfree(nulls); }
/*
 * Finish preparing state file.
 *
 * Calculates CRC and writes state file to WAL and in pg_twophase directory.
 *
 * Sequence:
 *	1. Register the end sentinel record and fill in total_len in the header.
 *	2. Create the state file and write every registered record, followed by
 *	   a deliberately bogus CRC.
 *	3. Inside a critical section with MyProc->inCommit set: insert and flush
 *	   the PREPARE WAL record, overwrite the bogus CRC with the correct one,
 *	   close the file and mark the gxact as prepared.
 *	4. Reset the list of registered records.
 *
 * Error reporting: each failure is reported with %m, so errno must still
 * describe the real cause when ereport() runs.  Two precautions are taken
 * on every write/lseek failure path:
 *	- errno is cleared before write(); a short write that leaves errno at 0
 *	  is reported as ENOSPC;
 *	- errno is saved across close(fd), which may overwrite it.
 *
 *	gxact	the global transaction being prepared; its prepare_lsn is set
 *			here to the location returned by XLogInsert.
 */
void
EndPrepare(GlobalTransaction gxact)
{
	TransactionId xid = gxact->proc.xid;
	TwoPhaseFileHeader *hdr;
	char		path[MAXPGPATH];
	XLogRecData *record;
	pg_crc32	statefile_crc;
	pg_crc32	bogus_crc;
	int			fd;
	int			save_errno;

	/* Add the end sentinel to the list of 2PC records */
	RegisterTwoPhaseRecord(TWOPHASE_RM_END_ID, 0,
						   NULL, 0);

	/* Go back and fill in total_len in the file header record */
	hdr = (TwoPhaseFileHeader *) records.head->data;
	Assert(hdr->magic == TWOPHASE_MAGIC);
	hdr->total_len = records.total_len + sizeof(pg_crc32);

	/*
	 * If the file size exceeds MaxAllocSize, we won't be able to read it in
	 * ReadTwoPhaseFile. Check for that now, rather than fail at commit time.
	 */
	if (hdr->total_len > MaxAllocSize)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("two-phase state file maximum length exceeded")));

	/*
	 * Create the 2PC state file.
	 *
	 * Note: because we use BasicOpenFile(), we are responsible for ensuring
	 * the FD gets closed in any error exit path.  Once we get into the
	 * critical section, though, it doesn't matter since any failure causes
	 * PANIC anyway.
	 */
	TwoPhaseFilePath(path, xid);

	fd = BasicOpenFile(path,
					   O_CREAT | O_EXCL | O_WRONLY | PG_BINARY,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create two-phase state file \"%s\": %m",
						path)));

	/* Write data to file, and calculate CRC as we pass over it */
	INIT_CRC32(statefile_crc);

	for (record = records.head; record != NULL; record = record->next)
	{
		COMP_CRC32(statefile_crc, record->data, record->len);

		errno = 0;
		if ((write(fd, record->data, record->len)) != record->len)
		{
			/* if write didn't set errno, assume problem is no disk space */
			save_errno = errno ? errno : ENOSPC;
			close(fd);
			errno = save_errno;
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write two-phase state file: %m")));
		}
	}

	FIN_CRC32(statefile_crc);

	/*
	 * Write a deliberately bogus CRC to the state file; this is just paranoia
	 * to catch the case where four more bytes will run us out of disk space.
	 */
	bogus_crc = ~statefile_crc;

	errno = 0;
	if ((write(fd, &bogus_crc, sizeof(pg_crc32))) != sizeof(pg_crc32))
	{
		/* if write didn't set errno, assume problem is no disk space */
		save_errno = errno ? errno : ENOSPC;
		close(fd);
		errno = save_errno;
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write two-phase state file: %m")));
	}

	/* Back up to prepare for rewriting the CRC */
	if (lseek(fd, -((off_t) sizeof(pg_crc32)), SEEK_CUR) < 0)
	{
		/* keep lseek's errno for %m; close() may overwrite it */
		save_errno = errno;
		close(fd);
		errno = save_errno;
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek in two-phase state file: %m")));
	}

	/*
	 * The state file isn't valid yet, because we haven't written the correct
	 * CRC yet.  Before we do that, insert entry in WAL and flush it to disk.
	 *
	 * Between the time we have written the WAL entry and the time we write
	 * out the correct state file CRC, we have an inconsistency: the xact is
	 * prepared according to WAL but not according to our on-disk state. We
	 * use a critical section to force a PANIC if we are unable to complete
	 * the write --- then, WAL replay should repair the inconsistency.  The
	 * odds of a PANIC actually occurring should be very tiny given that we
	 * were able to write the bogus CRC above.
	 *
	 * We have to set inCommit here, too; otherwise a checkpoint starting
	 * immediately after the WAL record is inserted could complete without
	 * fsync'ing our state file.  (This is essentially the same kind of race
	 * condition as the COMMIT-to-clog-write case that RecordTransactionCommit
	 * uses inCommit for; see notes there.)
	 *
	 * We save the PREPARE record's location in the gxact for later use by
	 * CheckPointTwoPhase.
	 */
	START_CRIT_SECTION();

	MyProc->inCommit = true;

	gxact->prepare_lsn = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE,
									records.head);
	XLogFlush(gxact->prepare_lsn);

	/* If we crash now, we have prepared: WAL replay will fix things */

	/* write correct CRC and close file */
	errno = 0;
	if ((write(fd, &statefile_crc, sizeof(pg_crc32))) != sizeof(pg_crc32))
	{
		/* if write didn't set errno, assume problem is no disk space */
		save_errno = errno ? errno : ENOSPC;
		close(fd);
		errno = save_errno;
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write two-phase state file: %m")));
	}

	if (close(fd) != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close two-phase state file: %m")));

	/*
	 * Mark the prepared transaction as valid.  As soon as xact.c marks MyProc
	 * as not running our XID (which it will do immediately after this
	 * function returns), others can commit/rollback the xact.
	 *
	 * NB: a side effect of this is to make a dummy ProcArray entry for the
	 * prepared XID.  This must happen before we clear the XID from MyProc,
	 * else there is a window where the XID is not running according to
	 * TransactionIdIsInProgress, and onlookers would be entitled to assume
	 * the xact crashed.  Instead we have a window where the same XID appears
	 * twice in ProcArray, which is OK.
	 */
	MarkAsPrepared(gxact);

	/*
	 * Now we can mark ourselves as out of the commit critical section: a
	 * checkpoint starting after this will certainly see the gxact as a
	 * candidate for fsyncing.
	 */
	MyProc->inCommit = false;

	END_CRIT_SECTION();

	/* The registered records have been consumed; reset the list */
	records.tail = records.head = NULL;
}
/*
 * PersistentStore_UpdateTuple
 *
 * Replace the whole tuple stored at persistentTid with one built from
 * values[], using frozen_heap_inplace_update().  The relation is opened and
 * closed through the storeData->openRel / closeRel callbacks.
 *
 *	storeData		supplies numAttributes, tableName and the callbacks
 *	storeSharedData	only validated (assert-enabled builds) and printed
 *	persistentTid	TID of the stored tuple to overwrite
 *	values			attribute values for the replacement tuple
 *	flushToXLog		when true, the XLOG record for this change is flushed
 *
 * Side effect: nowaitXLogEndLoc (declared outside this function) is zeroed
 * when the record was flushed, otherwise set to the XLOG end location of
 * the update.
 */
void
PersistentStore_UpdateTuple(
	PersistentStoreData 		*storeData,
	PersistentStoreSharedData 	*storeSharedData,
	ItemPointer 			persistentTid,
	Datum					*values,
	bool					flushToXLog)
{
	Relation	persistentRel;
	bool	   *nulls;
	HeapTuple	persistentTuple = NULL;
	XLogRecPtr	xlogUpdateEndLoc;

#ifdef USE_ASSERT_CHECKING
	if (storeSharedData == NULL ||
		!PersistentStoreSharedData_EyecatcherIsValid(storeSharedData))
		elog(ERROR, "Persistent store shared-memory not valid");
#endif

	/* Name this function in the message, matching the one emitted below */
	if (Debug_persistent_store_print)
		elog(PersistentStore_DebugPrintLevel(),
			 "PersistentStore_UpdateTuple: Going to update whole tuple at TID %s ('%s', shared data %p)",
			 ItemPointerToString(persistentTid),
			 storeData->tableName,
			 storeSharedData);

	persistentRel = (*storeData->openRel)();

	/*
	 * In order to keep the tuples the exact same size to enable direct reuse
	 * of free tuples, we do not use NULLs.
	 */
	nulls = (bool *) palloc0(storeData->numAttributes * sizeof(bool));

	/*
	 * Form the tuple.
	 */
	persistentTuple = heap_form_tuple(persistentRel->rd_att, values, nulls);
	if (!HeapTupleIsValid(persistentTuple))
		elog(ERROR, "Failed to build persistent tuple ('%s')",
			 storeData->tableName);

	/* Aim the new tuple at the existing tuple's location */
	persistentTuple->t_self = *persistentTid;

	frozen_heap_inplace_update(persistentRel, persistentTuple);

	/*
	 * Remember the XLOG location of the UPDATE tuple's XLOG record.
	 */
	xlogUpdateEndLoc = XLogLastInsertEndLoc();

	heap_freetuple(persistentTuple);

	/*
	 * Release the nulls array as PersistentStore_DoInsertTuple does;
	 * otherwise every update leaves it allocated.
	 */
	pfree(nulls);

#ifdef FAULT_INJECTOR
	/* On a "skip" fault, flush the relation's buffers and sync it now */
	if (FaultInjector_InjectFaultIfSet(
									   SyncPersistentTable,
									   DDLNotSpecified,
									   "" /* databaseName */,
									   "" /* tableName */) == FaultInjectorTypeSkip)
	{
		FlushRelationBuffers(persistentRel);
		smgrimmedsync(persistentRel->rd_smgr);
	}
#endif

	(*storeData->closeRel)(persistentRel);

	if (Debug_persistent_store_print)
	{
		elog(PersistentStore_DebugPrintLevel(),
			 "PersistentStore_UpdateTuple: Updated whole tuple at TID %s ('%s')",
			 ItemPointerToString(persistentTid),
			 storeData->tableName);

		(*storeData->printTupleCallback)(
									PersistentStore_DebugPrintLevel(),
									"STORE UPDATED TUPLE",
									persistentTid,
									values);
	}

	if (flushToXLog)
	{
		XLogFlush(xlogUpdateEndLoc);
		XLogRecPtr_Zero(&nowaitXLogEndLoc);
	}
	else
		nowaitXLogEndLoc = xlogUpdateEndLoc;
}
/* * Execute the barrier command on all the components, including Datanodes and * Coordinators. */ static void ExecuteBarrier(const char *id) { List *barrierDataNodeList = GetAllDataNodes(); List *barrierCoordList = GetAllCoordNodes(); PGXCNodeAllHandles *conn_handles; int conn; int msglen; int barrier_idlen; conn_handles = get_handles(barrierDataNodeList, barrierCoordList, false, true); elog(DEBUG2, "Sending CREATE BARRIER <%s> EXECUTE message to " "Datanodes and Coordinator", id); /* * Send a CREATE BARRIER request to all the Datanodes and the Coordinators */ for (conn = 0; conn < conn_handles->co_conn_count + conn_handles->dn_conn_count; conn++) { PGXCNodeHandle *handle; if (conn < conn_handles->co_conn_count) handle = conn_handles->coord_handles[conn]; else handle = conn_handles->datanode_handles[conn - conn_handles->co_conn_count]; /* Invalid connection state, return error */ if (handle->state != DN_CONNECTION_STATE_IDLE) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to send CREATE BARRIER EXECUTE request " "to the node"))); barrier_idlen = strlen(id) + 1; msglen = 4; /* for the length itself */ msglen += barrier_idlen; msglen += 1; /* for barrier command itself */ /* msgType + msgLen */ if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0) { ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Out of memory"))); } handle->outBuffer[handle->outEnd++] = 'b'; msglen = htonl(msglen); memcpy(handle->outBuffer + handle->outEnd, &msglen, 4); handle->outEnd += 4; handle->outBuffer[handle->outEnd++] = CREATE_BARRIER_EXECUTE; memcpy(handle->outBuffer + handle->outEnd, id, barrier_idlen); handle->outEnd += barrier_idlen; handle->state = DN_CONNECTION_STATE_QUERY; pgxc_node_flush(handle); } CheckBarrierCommandStatus(conn_handles, id, "EXECUTE"); pfree_pgxc_all_handles(conn_handles); /* * Also WAL log the BARRIER locally and flush the WAL buffers to disk */ { XLogRecData rdata[1]; XLogRecPtr recptr; rdata[0].data = (char *) id; 
rdata[0].len = strlen(id) + 1; rdata[0].buffer = InvalidBuffer; rdata[0].next = NULL; recptr = XLogInsert(RM_BARRIER_ID, XLOG_BARRIER_CREATE, rdata); XLogFlush(recptr); } }
/**
 * @brief Write the buffered blocks to the relation's data file(s).
 *
 * The number of blocks written is loader->curblk, plus one when the current
 * page is not empty.
 *
 * Flow:
 * <ol>
 *  <li>Log the first page to WAL and flush it, when applicable.</li>
 *  <li>If no more space is available in the data file, switch to a new
 *      one.</li>
 *  <li>Compute the number of blocks which can be written to the current
 *      file.</li>
 *  <li>Save the last block number in the load status file.</li>
 *  <li>Write to the current file.</li>
 *  <li>If there are other data, write them too.</li>
 * </ol>
 *
 * @param loader [in] Direct Writer.
 *
 * This function returns nothing; the data file descriptor is kept in
 * loader->datafd.
 */
static void
flush_pages(DirectWriter *loader)
{
	LoadStatus *ls = &loader->ls;
	int			num;
	int			i;
	bool		log_first_page;

	/* Full blocks, plus the current page when it holds something */
	num = loader->curblk + (PageIsEmpty(GetCurrentPage(loader)) ? 0 : 1);
	if (num <= 0)
		return;					/* no work */

	/*
	 * Add WAL entry (only the first page) to ensure the current xid will
	 * be recorded in xlog. We must flush some xlog records with XLogFlush()
	 * before write any data blocks to follow the WAL protocol.
	 *
	 * If postgres process, such as loader and COPY, is killed by "kill -9",
	 * database will be rewound to the last checkpoint and recovery will
	 * be performed using WAL.
	 *
	 * After the recovery, if there are xid's which have not been recorded
	 * to WAL, such xid's will be reused.
	 *
	 * However, in the loader and COPY, data file is actually updated and
	 * xid must not be reused.
	 *
	 * WAL entry with such xid can be added using XLogInsert().  However,
	 * such entries are not really written to the disk immediately.
	 * WAL entries are flushed to the disk by XLogFlush(), typically
	 * when a transaction is committed.  COPY prevents xid reuse by
	 * this method.
	 *
	 * The entry is written only when no block has been created yet and the
	 * relation is not local; from 9.1 on, unlogged relations are excluded
	 * as well.
	 */
	log_first_page = (ls->ls.create_cnt == 0 &&
					  !RELATION_IS_LOCAL(loader->base.rel));
#if PG_VERSION_NUM >= 90100
	if (log_first_page &&
		loader->base.rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED)
		log_first_page = false;
#endif
	if (log_first_page)
	{
		XLogRecPtr	recptr;

		recptr = log_newpage(&ls->ls.rnode, MAIN_FORKNUM,
							 ls->ls.exist_cnt, loader->blocks);
		XLogFlush(recptr);
	}

	/*
	 * Write blocks. We might need to write multiple files on boundary of
	 * relation segments.
	 */
	i = 0;
	while (i < num)
	{
		BlockNumber relblks = LS_TOTAL_CNT(ls);
		BlockNumber seg_off = relblks % RELSEG_SIZE;
		int			flush_num;
		char	   *cursor;
		int			remaining;

		/* Switch to the next file if the current file has been filled up. */
		if (seg_off == 0)
			close_data_file(loader);
		if (loader->datafd == -1)
			loader->datafd = open_data_file(ls->ls.rnode,
											RELATION_IS_LOCAL(loader->base.rel),
											relblks);

		/* Number of blocks to be added to the current file. */
		flush_num = Min(num - i, RELSEG_SIZE - seg_off);
		Assert(flush_num > 0);

		/* Write the last block number to the load status file. */
		UpdateLSF(loader, flush_num);

#if PG_VERSION_NUM >= 90300
		/* If we need a checksum, add it */
		if (DataChecksumsEnabled())
		{
			int			j;

			/*
			 * NOTE(review): the page index handed to GetTargetPage() does
			 * not include `i`, while the write below starts at block `i`.
			 * Confirm the page and block number are the intended ones when
			 * a flush continues into a second segment file.
			 */
			for (j = 0; j < flush_num; j++)
			{
				Page		contained_page = GetTargetPage(loader, j);

				((PageHeader) contained_page)->pd_checksum =
					pg_checksum_page((char *) contained_page,
									 LS_TOTAL_CNT(ls) - 1 - j);
			}
		}
#endif

		/*
		 * Flush flush_num data blocks, starting at block i of the buffer,
		 * to the current file.  write() may accept fewer bytes than asked
		 * for, so keep going until the whole range has been handed over.
		 */
		cursor = loader->blocks + BLCKSZ * i;
		remaining = BLCKSZ * flush_num;
		while (remaining > 0)
		{
			int			len = write(loader->datafd, cursor, remaining);

			if (len == -1)
			{
				/* fatal error, do not want to write blocks anymore */
				ereport(ERROR, (errcode_for_file_access(),
								errmsg("could not write to data file: %m")));
			}
			cursor += len;
			remaining -= len;
		}

		i += flush_num;
	}

	/*
	 * NOTICE: Be sure reset curblk to 0 and reinitialize recycled page
	 * if you will continue to use blocks.
	 */
}