/*
 *	mdimmedsync() -- Force a relation's already-issued writes to stable storage.
 *
 * Only writes that have already been issued are covered; dirty buffers still
 * sitting inside the buffer manager are not this routine's concern.
 *
 * Returns false if mdnblocks() reports failure or if any FileSync() call
 * fails, true otherwise.
 */
bool
mdimmedsync(SMgrRelation reln)
{
	MdfdVec    *seg;

	/*
	 * Calling mdnblocks first makes sure every existing segment has been
	 * opened, so the fsync pass below gets them all.
	 */
	if (mdnblocks(reln) == InvalidBlockNumber)
		return false;			/* mdnblocks failed */

	seg = mdopen(reln, false);

#ifndef LET_OS_MANAGE_FILESIZE
	/* walk the segment chain, syncing each file in turn */
	for (; seg != NULL; seg = seg->mdfd_chain)
	{
		if (FileSync(seg->mdfd_vfd) < 0)
			return false;
	}
#else
	/* no chain is walked in this configuration; sync the one entry */
	if (FileSync(seg->mdfd_vfd) < 0)
		return false;
#endif

	return true;
}
/*
 * register_dirty_segment() -- Record that a relation segment needs an fsync
 *
 * Three strategies are tried in order:
 *	 1. if this process has a pending-ops table, enter the request there for
 *		mdsync to handle later;
 *	 2. otherwise try to forward the request to the background writer;
 *	 3. if forwarding fails, fsync the segment right here (this is expected
 *		to be rare enough not to matter for performance).
 *
 * Returns false only when the local fsync fails; errno is then valid for
 * error reporting.
 */
static bool
register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
{
	if (pendingOpsTable)
	{
		PendingOperationEntry key;

		/* zero the whole struct first so that any pad bytes are zeroed */
		MemSet(&key, 0, sizeof(key));
		key.segno = seg->mdfd_segno;
		key.rnode = reln->smgr_rnode;

		(void) hash_search(pendingOpsTable, &key, HASH_ENTER, NULL);
		return true;
	}

	/* no local table: try to hand the request off */
	if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
		return true;

	/* hand-off failed, so do the fsync ourselves */
	return FileSync(seg->mdfd_vfd) >= 0;
}
/*
 * FileTruncate() -- Truncate the file behind a virtual file descriptor.
 *
 * file		index into VfdCache identifying the file
 * offset	new length, passed straight to ftruncate()
 *
 * Returns the result of ftruncate(), or the negative result of FileAccess()
 * when the file could not be made accessible.
 */
int
FileTruncate(File file, int offset)
{
	int			returnCode;

	DO_DB(printf("DEBUG: FileTruncate %d (%s)\n",
				 file, VfdCache[file].fileName));

	/* Best-effort flush before truncating; its result is deliberately ignored. */
	(void) FileSync(file);

	/*
	 * Do not go on to ftruncate() if the file could not be made accessible:
	 * VfdCache[file].fd would not be usable in that case.
	 * NOTE(review): assumes FileAccess() reports failure with a negative
	 * return value -- confirm against its definition.
	 */
	returnCode = FileAccess(file);
	if (returnCode < 0)
		return returnCode;

	returnCode = ftruncate(VfdCache[file].fd, offset);
	return returnCode;
}
/*
 * register_dirty_segment() -- Record that a relation segment needs an fsync
 *
 * Three strategies are tried in order:
 *	 1. if this process has a pending-ops table, enter the request there for
 *		mdsync to handle later;
 *	 2. otherwise try to forward the request to the background writer;
 *	 3. if forwarding fails, fsync the segment right here (this is expected
 *		to be rare enough not to matter for performance).
 *
 * Returns false only when the local fsync fails; errno is then valid for
 * error reporting.
 */
static bool
register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
{
	if (pendingOpsTable)
	{
		/* we have a local pending-ops table: remember the request there */
		RememberFsyncRequest(reln->smgr_rnode, seg->mdfd_segno);
		return true;
	}

	/* no local table: try to hand the request off */
	if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
		return true;

	/* hand-off failed, so do the fsync ourselves */
	return FileSync(seg->mdfd_vfd) >= 0;
}
/*
 *	mdimmedsync() -- Immediately sync a relation to stable storage.
 *
 * Note that only writes already issued are synced; this routine knows
 * nothing of dirty buffers that may exist inside the buffer manager.
 *
 * Every segment in the fork's chain is fsync'd in order.  A failed fsync is
 * reported with ereport(ERROR), naming the offending file.
 */
void
mdimmedsync(SMgrRelation reln, ForkNumber forknum)
{
	MdfdVec    *v;

	/*
	 * NOTE: mdnblocks makes sure we have opened all active segments, so that
	 * fsync loop will get them all!  Only that side effect is wanted here;
	 * the block count it returns is not needed.
	 */
	(void) mdnblocks(reln, forknum);

	v = mdopen(reln, forknum, EXTENSION_FAIL);

	while (v != NULL)
	{
		if (FileSync(v->mdfd_vfd) < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not fsync file \"%s\": %m",
							FilePathName(v->mdfd_vfd))));
		v = v->mdfd_chain;
	}
}
/* * mdsync() -- Sync previous writes to stable storage. */ void mdsync(void) { static bool mdsync_in_progress = false; HASH_SEQ_STATUS hstat; PendingOperationEntry *entry; int absorb_counter; /* * This is only called during checkpoints, and checkpoints should only * occur in processes that have created a pendingOpsTable. */ if (!pendingOpsTable) elog(ERROR, "cannot sync without a pendingOpsTable"); /* * If we are in the bgwriter, the sync had better include all fsync * requests that were queued by backends up to this point. The tightest * race condition that could occur is that a buffer that must be written * and fsync'd for the checkpoint could have been dumped by a backend just * before it was visited by BufferSync(). We know the backend will have * queued an fsync request before clearing the buffer's dirtybit, so we * are safe as long as we do an Absorb after completing BufferSync(). */ AbsorbFsyncRequests(); /* * To avoid excess fsync'ing (in the worst case, maybe a never-terminating * checkpoint), we want to ignore fsync requests that are entered into the * hashtable after this point --- they should be processed next time, * instead. We use mdsync_cycle_ctr to tell old entries apart from new * ones: new ones will have cycle_ctr equal to the incremented value of * mdsync_cycle_ctr. * * In normal circumstances, all entries present in the table at this point * will have cycle_ctr exactly equal to the current (about to be old) * value of mdsync_cycle_ctr. However, if we fail partway through the * fsync'ing loop, then older values of cycle_ctr might remain when we * come back here to try again. Repeated checkpoint failures would * eventually wrap the counter around to the point where an old entry * might appear new, causing us to skip it, possibly allowing a checkpoint * to succeed that should not have. To forestall wraparound, any time the * previous mdsync() failed to complete, run through the table and * forcibly set cycle_ctr = mdsync_cycle_ctr. 
* * Think not to merge this loop with the main loop, as the problem is * exactly that that loop may fail before having visited all the entries. * From a performance point of view it doesn't matter anyway, as this path * will never be taken in a system that's functioning normally. */ if (mdsync_in_progress) { /* prior try failed, so update any stale cycle_ctr values */ hash_seq_init(&hstat, pendingOpsTable); while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) { entry->cycle_ctr = mdsync_cycle_ctr; } } /* Advance counter so that new hashtable entries are distinguishable */ mdsync_cycle_ctr++; /* Set flag to detect failure if we don't reach the end of the loop */ mdsync_in_progress = true; /* Now scan the hashtable for fsync requests to process */ absorb_counter = FSYNCS_PER_ABSORB; hash_seq_init(&hstat, pendingOpsTable); while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) { /* * If the entry is new then don't process it this time. Note that * "continue" bypasses the hash-remove call at the bottom of the loop. */ if (entry->cycle_ctr == mdsync_cycle_ctr) continue; /* Else assert we haven't missed it */ Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr); /* * If fsync is off then we don't have to bother opening the file at * all. (We delay checking until this point so that changing fsync on * the fly behaves sensibly.) Also, if the entry is marked canceled, * fall through to delete it. */ if (enableFsync && !entry->canceled) { int failures; /* * If in bgwriter, we want to absorb pending requests every so * often to prevent overflow of the fsync request queue. It is * unspecified whether newly-added entries will be visited by * hash_seq_search, but we don't care since we don't need to * process them anyway. 
*/ if (--absorb_counter <= 0) { AbsorbFsyncRequests(); absorb_counter = FSYNCS_PER_ABSORB; } /* * The fsync table could contain requests to fsync segments that * have been deleted (unlinked) by the time we get to them. Rather * than just hoping an ENOENT (or EACCES on Windows) error can be * ignored, what we do on error is absorb pending requests and * then retry. Since mdunlink() queues a "revoke" message before * actually unlinking, the fsync request is guaranteed to be * marked canceled after the absorb if it really was this case. * DROP DATABASE likewise has to tell us to forget fsync requests * before it starts deletions. */ for (failures = 0;; failures++) /* loop exits at "break" */ { SMgrRelation reln; MdfdVec *seg; char *path; /* * Find or create an smgr hash entry for this relation. This * may seem a bit unclean -- md calling smgr? But it's really * the best solution. It ensures that the open file reference * isn't permanently leaked if we get an error here. (You may * say "but an unreferenced SMgrRelation is still a leak!" Not * really, because the only case in which a checkpoint is done * by a process that isn't about to shut down is in the * bgwriter, and it will periodically do smgrcloseall(). This * fact justifies our not closing the reln in the success path * either, which is a good thing since in non-bgwriter cases * we couldn't safely do that.) Furthermore, in many cases * the relation will have been dirtied through this same smgr * relation, and so we can save a file open/close cycle. */ reln = smgropen(entry->tag.rnode); /* * It is possible that the relation has been dropped or * truncated since the fsync request was entered. Therefore, * allow ENOENT, but only if we didn't fail already on this * file. This applies both during _mdfd_getseg() and during * FileSync, since fd.c might have closed the file behind our * back. 
*/ seg = _mdfd_getseg(reln, entry->tag.forknum, entry->tag.segno * ((BlockNumber) RELSEG_SIZE), false, EXTENSION_RETURN_NULL); if (seg != NULL && FileSync(seg->mdfd_vfd) >= 0) break; /* success; break out of retry loop */ /* * XXX is there any point in allowing more than one retry? * Don't see one at the moment, but easy to change the test * here if so. */ path = _mdfd_segpath(reln, entry->tag.forknum, entry->tag.segno); if (!FILE_POSSIBLY_DELETED(errno) || failures > 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", path))); else ereport(DEBUG1, (errcode_for_file_access(), errmsg("could not fsync file \"%s\" but retrying: %m", path))); pfree(path); /* * Absorb incoming requests and check to see if canceled. */ AbsorbFsyncRequests(); absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */ if (entry->canceled) break; } /* end retry loop */ } /* * If we get here, either we fsync'd successfully, or we don't have to * because enableFsync is off, or the entry is (now) marked canceled. * Okay to delete it. */ if (hash_search(pendingOpsTable, &entry->tag, HASH_REMOVE, NULL) == NULL) elog(ERROR, "pendingOpsTable corrupted"); } /* end loop over hashtable entries */ /* Flag successful completion of mdsync */ mdsync_in_progress = false; }
/* * mdsync() -- Sync previous writes to stable storage. * * This is only called during checkpoints, and checkpoints should only * occur in processes that have created a pendingOpsTable. */ bool mdsync(void) { HASH_SEQ_STATUS hstat; PendingOperationEntry *entry; if (!pendingOpsTable) return false; /* * If we are in the bgwriter, the sync had better include all fsync * requests that were queued by backends before the checkpoint REDO point * was determined. We go that a little better by accepting all requests * queued up to the point where we start fsync'ing. */ AbsorbFsyncRequests(); hash_seq_init(&hstat, pendingOpsTable); while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) { /* * If fsync is off then we don't have to bother opening the file at * all. (We delay checking until this point so that changing fsync on * the fly behaves sensibly.) */ if (enableFsync) { SMgrRelation reln; MdfdVec *seg; /* * Find or create an smgr hash entry for this relation. This may * seem a bit unclean -- md calling smgr? But it's really the * best solution. It ensures that the open file reference isn't * permanently leaked if we get an error here. (You may say "but * an unreferenced SMgrRelation is still a leak!" Not really, * because the only case in which a checkpoint is done by a * process that isn't about to shut down is in the bgwriter, and * it will periodically do smgrcloseall(). This fact justifies * our not closing the reln in the success path either, which is a * good thing since in non-bgwriter cases we couldn't safely do * that.) Furthermore, in many cases the relation will have been * dirtied through this same smgr relation, and so we can save a * file open/close cycle. */ reln = smgropen(entry->rnode); /* * It is possible that the relation has been dropped or truncated * since the fsync request was entered. Therefore, we have to * allow file-not-found errors. 
This applies both during * _mdfd_getseg() and during FileSync, since fd.c might have * closed the file behind our back. */ seg = _mdfd_getseg(reln, entry->segno * ((BlockNumber) RELSEG_SIZE), true); if (seg) { if (FileSync(seg->mdfd_vfd) < 0 && errno != ENOENT) { ereport(LOG, (errcode_for_file_access(), errmsg("could not fsync segment %u of relation %u/%u/%u: %m", entry->segno, entry->rnode.spcNode, entry->rnode.dbNode, entry->rnode.relNode))); return false; } } } /* Okay, delete this entry */ if (hash_search(pendingOpsTable, entry, HASH_REMOVE, NULL) == NULL) elog(ERROR, "pendingOpsTable corrupted"); } return true; }
/*
 * Flush a flat file.
 *
 * Works in three steps, in this order:
 *	 1. when the mirror mode says to send to the mirror and no mirror data
 *		loss has been recorded, issue a flush request to the mirror;
 *	 2. when the mirror mode says to do primary work and
 *		FileRepResyncWorker_IsResyncRequest() is false, fsync the primary
 *		file;
 *	 3. under the same condition as step 1, check whether the mirror
 *		operation completed.
 *
 * Returns true when the primary fsync did not fail.  On return errno holds
 * the primary fsync's error code, or 0.
 */
bool
MirroredBufferPool_Flush(
						 MirroredBufferPoolOpen *open)	/* The open struct. */
{
	int			primaryError = 0;
	FileRepGpmonRecord_s gpmonRecord;

	Assert(open != NULL);
	Assert(open->isActive);

	/*
	 * For Buffer Pool managed, we are normally not session oriented like
	 * Append-Only.
	 *
	 * Figure out mirroring each time...
	 */
	MirroredBufferPool_RecheckMirrorAccess(open);

	/* Step 1: send the flush request to the mirror. */
	if (StorageManagerMirrorMode_SendToMirror(open->mirrorMode) &&
		!open->mirrorDataLossOccurred)
	{
		if (fileRepRole == FileRepPrimaryRole)
		{
			FileRepGpmonStat_OpenRecord(
							FileRepGpmonStatType_PrimaryRoundtripFsyncMsg,
										&gpmonRecord);
		}

		/* a failed request is only logged when debug printing is enabled */
		if (FileRepPrimary_MirrorFlush(
									   FileRep_GetRelationIdentifier(
											  open->mirrorFilespaceLocation,
														  open->relFileNode,
													  open->segmentFileNum),
									   FileRepRelationTypeBufferPool) != 0 &&
			Debug_filerep_print)
		{
			ereport(LOG,
					(errmsg("could not sent file fsync request to mirror "),
					 FileRep_ReportRelationPath(
											  open->mirrorFilespaceLocation,
												open->relFileNode,
												open->segmentFileNum)));
		}

		open->mirrorDataLossOccurred = FileRepPrimary_IsMirrorDataLossOccurred();
	}

	/* Step 2: fsync the primary file, remembering errno on failure. */
	if (StorageManagerMirrorMode_DoPrimaryWork(open->mirrorMode) &&
		!FileRepResyncWorker_IsResyncRequest())
	{
		errno = 0;

		if (FileSync(open->primaryFile) < 0)
			primaryError = errno;
	}

	/* Step 3: check whether the mirror operation completed. */
	if (StorageManagerMirrorMode_SendToMirror(open->mirrorMode) &&
		!open->mirrorDataLossOccurred)
	{
		if (FileRepPrimary_IsOperationCompleted(
												FileRep_GetRelationIdentifier(
											  open->mirrorFilespaceLocation,
														  open->relFileNode,
													  open->segmentFileNum),
									 FileRepRelationTypeBufferPool) != FALSE)
		{
			/* only include this stat if the fsync was successful */
			if (fileRepRole == FileRepPrimaryRole)
			{
				FileRepGpmonStat_CloseRecord(
							FileRepGpmonStatType_PrimaryRoundtripFsyncMsg,
											 &gpmonRecord);
			}
		}
		else
		{
			ereport(LOG,
					(errmsg("could not fsync file on mirror "),
					 FileRep_ReportRelationPath(
											  open->mirrorFilespaceLocation,
												open->relFileNode,
												open->segmentFileNum)));
		}

		open->mirrorDataLossOccurred = FileRepPrimary_IsMirrorDataLossOccurred();
	}

	/* hand the primary's error code back to the caller through errno */
	errno = primaryError;
	return (primaryError == 0);
}