Exemplo n.º 1
0
/*
 *	mdimmedsync() -- Force a relation's already-issued writes to disk now.
 *
 * Each segment reachable from the relation's MdfdVec chain is handed to
 * FileSync().  Only data that has already been written out is covered;
 * pages that are still dirty inside the buffer manager are unknown to this
 * routine.
 *
 * Returns true on success, false if mdnblocks() or any FileSync() fails.
 */
bool
mdimmedsync(SMgrRelation reln)
{
	MdfdVec    *seg;

	/*
	 * The block count itself is not needed here.  mdnblocks is called
	 * because it makes sure all existing segments have been opened, which
	 * is what lets the fsync loop below reach every one of them.
	 */
	if (mdnblocks(reln) == InvalidBlockNumber)
		return false;			/* mdnblocks failed */

	seg = mdopen(reln, false);

#ifndef LET_OS_MANAGE_FILESIZE
	/* walk the segment chain, syncing each file in turn */
	for (; seg != NULL; seg = seg->mdfd_chain)
	{
		if (FileSync(seg->mdfd_vfd) < 0)
			return false;
	}

	return true;
#else
	/* in this configuration only the entry returned by mdopen is synced */
	return (FileSync(seg->mdfd_vfd) >= 0);
#endif
}
Exemplo n.º 2
0
/*
 * register_dirty_segment() -- Note that a relation segment needs an fsync
 *
 * Three strategies are tried in order:
 *	1. When this process has a local pending-ops table, enter the segment
 *	   there so that mdsync can process it later.
 *	2. Otherwise try to hand the request to the background writer process.
 *	3. If that hand-off fails, fsync the segment right here before
 *	   returning.  This is expected to be rare enough not to be a
 *	   performance problem.
 *
 * Returns false only when the local fsync in step 3 fails, in which case
 * errno is valid for error reporting.
 */
static bool
register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
{
	if (pendingOpsTable)
	{
		PendingOperationEntry key;

		/* zero the whole struct first so that pad bytes are zeroed too */
		MemSet(&key, 0, sizeof(key));
		key.rnode = reln->smgr_rnode;
		key.segno = seg->mdfd_segno;

		(void) hash_search(pendingOpsTable, &key, HASH_ENTER, NULL);
		return true;
	}

	/* no local table: see whether the request can be forwarded */
	if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
		return true;

	/* forwarding failed, so do the fsync ourselves */
	return (FileSync(seg->mdfd_vfd) >= 0);
}
Exemplo n.º 3
0
/*
 * FileTruncate() -- Truncate a virtual file to "offset" bytes.
 *
 * The file is synced first (best effort; that result is deliberately not
 * examined), then FileAccess() is called before the descriptor stored in
 * VfdCache is used.  If FileAccess() reports failure, its result is
 * returned directly rather than calling ftruncate() on a descriptor that
 * may not be usable, so the caller sees the original failure.
 *
 * Returns 0 on success and a negative value on failure.
 */
int
FileTruncate(File file, int offset)
{
    int returnCode;

    DO_DB(printf("DEBUG: FileTruncate %d (%s)\n",
		 file, VfdCache[file].fileName));

    (void) FileSync(file);

    /*
     * NOTE(review): assumes FileAccess signals failure with a negative
     * result, as FileSync and ftruncate do -- confirm against its
     * definition.
     */
    returnCode = FileAccess(file);
    if (returnCode < 0)
	return returnCode;

    returnCode = ftruncate(VfdCache[file].fd, offset);
    return returnCode;
}
Exemplo n.º 4
0
/*
 * register_dirty_segment() -- Record that a relation segment must be fsync'd
 *
 * With a local pending-ops table the request is simply remembered there for
 * mdsync to process later.  Without one, the request is offered to the
 * background writer process; only if that fails is the segment fsync'd
 * locally before returning, which is not expected to happen often enough to
 * be a performance problem.
 *
 * A false result means the local fsync failed; errno is then valid for
 * error reporting.
 */
static bool
register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
{
	if (pendingOpsTable)
	{
		/* we own a pending-ops table: queue the request in it */
		RememberFsyncRequest(reln->smgr_rnode, seg->mdfd_segno);
		return true;
	}

	/* otherwise try to pass the request along */
	if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
		return true;

	/* last resort: sync the segment ourselves */
	return (FileSync(seg->mdfd_vfd) >= 0);
}
Exemplo n.º 5
0
/*
 *	mdimmedsync() -- Flush one fork of a relation to stable storage at once.
 *
 * Only writes that have already been issued are synced; dirty buffers still
 * held by the buffer manager are outside this routine's knowledge.
 *
 * Reports an ERROR if FileSync() fails on any segment file.
 */
void
mdimmedsync(SMgrRelation reln, ForkNumber forknum)
{
	MdfdVec    *seg;

	/*
	 * Called for its side effect only: mdnblocks makes sure all active
	 * segments are open, so that the loop below will visit every one.
	 */
	(void) mdnblocks(reln, forknum);

	for (seg = mdopen(reln, forknum, EXTENSION_FAIL);
		 seg != NULL;
		 seg = seg->mdfd_chain)
	{
		if (FileSync(seg->mdfd_vfd) < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not fsync file \"%s\": %m",
							FilePathName(seg->mdfd_vfd))));
	}
}
Exemplo n.º 6
0
/*
 *	mdsync() -- Sync previous writes to stable storage.
 *
 * Scans pendingOpsTable and, for every entry that was already present when
 * this call advanced mdsync_cycle_ctr, fsyncs the referenced segment (unless
 * enableFsync is off or the entry has been marked canceled) and then removes
 * the entry.  Entries added during the scan carry the new counter value and
 * are left in the table for the next call.
 *
 * Takes no arguments and returns nothing.  Problems are reported with
 * elog/ereport at ERROR level:
 *	- there is no pendingOpsTable;
 *	- an fsync attempt fails with an errno that does not satisfy
 *	  FILE_POSSIBLY_DELETED, or fails a second time for the same entry;
 *	- removing a processed entry from the hash table fails.
 */
void
mdsync(void)
{
	/*
	 * Set just before the main loop and cleared after it.  If it is still
	 * true on entry, the previous call did not reach the end of its loop.
	 */
	static bool mdsync_in_progress = false;

	HASH_SEQ_STATUS hstat;
	PendingOperationEntry *entry;
	/* counts down fsync attempts until the next AbsorbFsyncRequests() call */
	int			absorb_counter;

	/*
	 * This is only called during checkpoints, and checkpoints should only
	 * occur in processes that have created a pendingOpsTable.
	 */
	if (!pendingOpsTable)
		elog(ERROR, "cannot sync without a pendingOpsTable");

	/*
	 * If we are in the bgwriter, the sync had better include all fsync
	 * requests that were queued by backends up to this point.  The tightest
	 * race condition that could occur is that a buffer that must be written
	 * and fsync'd for the checkpoint could have been dumped by a backend just
	 * before it was visited by BufferSync().  We know the backend will have
	 * queued an fsync request before clearing the buffer's dirtybit, so we
	 * are safe as long as we do an Absorb after completing BufferSync().
	 */
	AbsorbFsyncRequests();

	/*
	 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
	 * checkpoint), we want to ignore fsync requests that are entered into the
	 * hashtable after this point --- they should be processed next time,
	 * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
	 * ones: new ones will have cycle_ctr equal to the incremented value of
	 * mdsync_cycle_ctr.
	 *
	 * In normal circumstances, all entries present in the table at this point
	 * will have cycle_ctr exactly equal to the current (about to be old)
	 * value of mdsync_cycle_ctr.  However, if we fail partway through the
	 * fsync'ing loop, then older values of cycle_ctr might remain when we
	 * come back here to try again.  Repeated checkpoint failures would
	 * eventually wrap the counter around to the point where an old entry
	 * might appear new, causing us to skip it, possibly allowing a checkpoint
	 * to succeed that should not have.  To forestall wraparound, any time the
	 * previous mdsync() failed to complete, run through the table and
	 * forcibly set cycle_ctr = mdsync_cycle_ctr.
	 *
	 * Do not merge this loop with the main loop: the problem is exactly that
	 * the main loop may fail before having visited all the entries.  From a
	 * performance point of view it doesn't matter anyway, as this path will
	 * never be taken in a system that's functioning normally.
	 */
	if (mdsync_in_progress)
	{
		/* prior try failed, so update any stale cycle_ctr values */
		hash_seq_init(&hstat, pendingOpsTable);
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
		{
			entry->cycle_ctr = mdsync_cycle_ctr;
		}
	}

	/* Advance counter so that new hashtable entries are distinguishable */
	mdsync_cycle_ctr++;

	/* Set flag to detect failure if we don't reach the end of the loop */
	mdsync_in_progress = true;

	/* Now scan the hashtable for fsync requests to process */
	absorb_counter = FSYNCS_PER_ABSORB;
	hash_seq_init(&hstat, pendingOpsTable);
	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
	{
		/*
		 * If the entry is new then don't process it this time.  Note that
		 * "continue" bypasses the hash-remove call at the bottom of the loop.
		 */
		if (entry->cycle_ctr == mdsync_cycle_ctr)
			continue;

		/*
		 * Else assert we haven't missed it: an old entry must carry exactly
		 * the counter value that was current before the increment above.
		 */
		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);

		/*
		 * If fsync is off then we don't have to bother opening the file at
		 * all.  (We delay checking until this point so that changing fsync on
		 * the fly behaves sensibly.)  Also, if the entry is marked canceled,
		 * fall through to delete it.
		 */
		if (enableFsync && !entry->canceled)
		{
			/* number of failed fsync attempts so far for this entry */
			int			failures;

			/*
			 * If in bgwriter, we want to absorb pending requests every so
			 * often to prevent overflow of the fsync request queue.  It is
			 * unspecified whether newly-added entries will be visited by
			 * hash_seq_search, but we don't care since we don't need to
			 * process them anyway.
			 */
			if (--absorb_counter <= 0)
			{
				AbsorbFsyncRequests();
				absorb_counter = FSYNCS_PER_ABSORB;
			}

			/*
			 * The fsync table could contain requests to fsync segments that
			 * have been deleted (unlinked) by the time we get to them. Rather
			 * than just hoping an ENOENT (or EACCES on Windows) error can be
			 * ignored, what we do on error is absorb pending requests and
			 * then retry.  Since mdunlink() queues a "revoke" message before
			 * actually unlinking, the fsync request is guaranteed to be
			 * marked canceled after the absorb if it really was this case.
			 * DROP DATABASE likewise has to tell us to forget fsync requests
			 * before it starts deletions.
			 */
			for (failures = 0;; failures++)		/* loop exits at "break" */
			{
				SMgrRelation reln;
				MdfdVec    *seg;
				char	   *path;

				/*
				 * Find or create an smgr hash entry for this relation. This
				 * may seem a bit unclean -- md calling smgr?  But it's really
				 * the best solution.  It ensures that the open file reference
				 * isn't permanently leaked if we get an error here. (You may
				 * say "but an unreferenced SMgrRelation is still a leak!" Not
				 * really, because the only case in which a checkpoint is done
				 * by a process that isn't about to shut down is in the
				 * bgwriter, and it will periodically do smgrcloseall(). This
				 * fact justifies our not closing the reln in the success path
				 * either, which is a good thing since in non-bgwriter cases
				 * we couldn't safely do that.)  Furthermore, in many cases
				 * the relation will have been dirtied through this same smgr
				 * relation, and so we can save a file open/close cycle.
				 */
				reln = smgropen(entry->tag.rnode);

				/*
				 * It is possible that the relation has been dropped or
				 * truncated since the fsync request was entered.  Therefore,
				 * allow ENOENT, but only if we didn't fail already on this
				 * file.  This applies both during _mdfd_getseg() and during
				 * FileSync, since fd.c might have closed the file behind our
				 * back.
				 *
				 * The segment number is converted to the block number of the
				 * segment's first block before the lookup.
				 * NOTE(review): EXTENSION_RETURN_NULL presumably makes
				 * _mdfd_getseg() return NULL instead of raising an error
				 * when the segment cannot be opened -- confirm.
				 */
				seg = _mdfd_getseg(reln, entry->tag.forknum,
							  entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
								   false, EXTENSION_RETURN_NULL);
				if (seg != NULL &&
					FileSync(seg->mdfd_vfd) >= 0)
					break;		/* success; break out of retry loop */

				/*
				 * XXX is there any point in allowing more than one retry?
				 * Don't see one at the moment, but easy to change the test
				 * here if so.
				 *
				 * The first failure is only logged at DEBUG1, and only when
				 * errno suggests the file may have been deleted; any other
				 * errno, or a second failure, is reported as an ERROR.
				 */
				path = _mdfd_segpath(reln, entry->tag.forknum,
									 entry->tag.segno);
				if (!FILE_POSSIBLY_DELETED(errno) ||
					failures > 0)
					ereport(ERROR,
							(errcode_for_file_access(),
						   errmsg("could not fsync file \"%s\": %m", path)));
				else
					ereport(DEBUG1,
							(errcode_for_file_access(),
					   errmsg("could not fsync file \"%s\" but retrying: %m",
							  path)));
				/* the path string was only needed for the message above */
				pfree(path);

				/*
				 * Absorb incoming requests and check to see if canceled.
				 */
				AbsorbFsyncRequests();
				absorb_counter = FSYNCS_PER_ABSORB;		/* might as well... */

				/* a canceled entry needs no fsync; go on to delete it */
				if (entry->canceled)
					break;
			}					/* end retry loop */
		}

		/*
		 * If we get here, either we fsync'd successfully, or we don't have to
		 * because enableFsync is off, or the entry is (now) marked canceled.
		 * Okay to delete it.  The entry's tag is used as the lookup key.
		 */
		if (hash_search(pendingOpsTable, &entry->tag,
						HASH_REMOVE, NULL) == NULL)
			elog(ERROR, "pendingOpsTable corrupted");
	}							/* end loop over hashtable entries */

	/* Flag successful completion of mdsync */
	mdsync_in_progress = false;
}
Exemplo n.º 7
0
/*
 *	mdsync() -- Flush previously issued writes to stable storage.
 *
 * Walks the pending-ops table, fsyncs each recorded segment (when fsync is
 * enabled) and removes the entry.  Checkpoints are the only callers, and a
 * checkpoint should only run in a process that has created a
 * pendingOpsTable.
 *
 * Returns true when every entry was handled, false if there is no
 * pendingOpsTable or an fsync failed with an errno other than ENOENT.
 */
bool
mdsync(void)
{
	HASH_SEQ_STATUS scan;
	PendingOperationEntry *pending;

	if (!pendingOpsTable)
		return false;

	/*
	 * In the bgwriter the sync must cover every fsync request that backends
	 * queued before the checkpoint REDO point was determined.  We do a
	 * little better than that by accepting everything queued up to the
	 * moment we start fsync'ing.
	 */
	AbsorbFsyncRequests();

	hash_seq_init(&scan, pendingOpsTable);
	for (;;)
	{
		pending = (PendingOperationEntry *) hash_seq_search(&scan);
		if (pending == NULL)
			break;

		/*
		 * With fsync turned off there is no need to open the file at all.
		 * The setting is checked here, per entry, so that changing fsync on
		 * the fly behaves sensibly.
		 */
		if (enableFsync)
		{
			SMgrRelation rel;
			MdfdVec    *seg;

			/*
			 * Find or create an smgr hash entry for the relation.  Having md
			 * call smgr may look unclean, but it ensures the open file
			 * reference isn't permanently leaked if an error occurs here.
			 * The unreferenced SMgrRelation is not really a leak either: the
			 * only process that runs a checkpoint without being about to
			 * shut down is the bgwriter, which periodically does
			 * smgrcloseall().  That also justifies not closing the reln on
			 * success, which non-bgwriter processes could not safely do
			 * anyway.  In many cases the relation was dirtied through this
			 * same smgr relation, so a file open/close cycle is saved too.
			 */
			rel = smgropen(pending->rnode);

			/*
			 * The relation may have been dropped or truncated since the
			 * request was entered, so file-not-found has to be tolerated,
			 * both from _mdfd_getseg() and from FileSync (fd.c might have
			 * closed the file behind our back).
			 */
			seg = _mdfd_getseg(rel,
							   pending->segno * ((BlockNumber) RELSEG_SIZE),
							   true);

			if (seg != NULL &&
				FileSync(seg->mdfd_vfd) < 0 &&
				errno != ENOENT)
			{
				ereport(LOG,
						(errcode_for_file_access(),
						 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
								pending->segno,
								pending->rnode.spcNode,
								pending->rnode.dbNode,
								pending->rnode.relNode)));
				return false;
			}
		}

		/* This request is done with; drop it from the table */
		if (hash_search(pendingOpsTable, pending,
						HASH_REMOVE, NULL) == NULL)
			elog(ERROR, "pendingOpsTable corrupted");
	}

	return true;
}
Exemplo n.º 8
0
/*
 * Flush a flat file managed through the mirrored buffer pool.
 *
 * The work happens in three phases: a flush request is sent to the mirror
 * (when the mirror mode calls for it and no mirror data loss has occurred),
 * the primary file is fsync'd, and finally the mirror operation is checked
 * for completion.
 *
 * open: the open struct; must be non-NULL and active.
 *
 * Returns true when no error was recorded for the primary fsync.  On return
 * errno holds the error code saved from the primary fsync (0 if none).
 * Mirror-side problems are only logged and reflected in
 * open->mirrorDataLossOccurred; they do not affect the result.
 */
bool
MirroredBufferPool_Flush(MirroredBufferPoolOpen *open)
{
	FileRepGpmonRecord_s gpmonRecord;
	int			savedPrimaryErrno = 0;

	Assert(open != NULL);
	Assert(open->isActive);

	/*
	 * Buffer Pool managed files are normally not session oriented the way
	 * Append-Only is, so mirror access is figured out again on every call.
	 */
	MirroredBufferPool_RecheckMirrorAccess(open);

	/* Phase 1: send the flush request to the mirror. */
	if (StorageManagerMirrorMode_SendToMirror(open->mirrorMode) &&
		!open->mirrorDataLossOccurred)
	{
		if (fileRepRole == FileRepPrimaryRole)
		{
			FileRepGpmonStat_OpenRecord(
				FileRepGpmonStatType_PrimaryRoundtripFsyncMsg,
				&gpmonRecord);
		}

		if (FileRepPrimary_MirrorFlush(
				FileRep_GetRelationIdentifier(open->mirrorFilespaceLocation,
											  open->relFileNode,
											  open->segmentFileNum),
				FileRepRelationTypeBufferPool) != 0 &&
			Debug_filerep_print)
		{
			ereport(LOG,
					(errmsg("could not sent file fsync request to mirror "),
					 FileRep_ReportRelationPath(open->mirrorFilespaceLocation,
												open->relFileNode,
												open->segmentFileNum)));
		}

		open->mirrorDataLossOccurred = FileRepPrimary_IsMirrorDataLossOccurred();
	}

	/* Phase 2: fsync the primary file, remembering errno on failure. */
	if (StorageManagerMirrorMode_DoPrimaryWork(open->mirrorMode) &&
		!FileRepResyncWorker_IsResyncRequest())
	{
		errno = 0;

		if (FileSync(open->primaryFile) < 0)
			savedPrimaryErrno = errno;
	}

	/*
	 * Phase 3: check the mirror operation.  The condition is evaluated
	 * afresh because mirrorDataLossOccurred may have been set in phase 1.
	 */
	if (StorageManagerMirrorMode_SendToMirror(open->mirrorMode) &&
		!open->mirrorDataLossOccurred)
	{
		if (FileRepPrimary_IsOperationCompleted(
				FileRep_GetRelationIdentifier(open->mirrorFilespaceLocation,
											  open->relFileNode,
											  open->segmentFileNum),
				FileRepRelationTypeBufferPool) != FALSE)
		{
			/* only include this stat if the fsync was successful */
			if (fileRepRole == FileRepPrimaryRole)
			{
				FileRepGpmonStat_CloseRecord(
					FileRepGpmonStatType_PrimaryRoundtripFsyncMsg,
					&gpmonRecord);
			}
		}
		else
		{
			ereport(LOG,
					(errmsg("could not fsync file on mirror "),
					 FileRep_ReportRelationPath(open->mirrorFilespaceLocation,
												open->relFileNode,
												open->segmentFileNum)));
		}

		open->mirrorDataLossOccurred = FileRepPrimary_IsMirrorDataLossOccurred();
	}

	/* hand the primary's error code back to the caller through errno */
	errno = savedPrimaryErrno;
	return (savedPrimaryErrno == 0);
}