Ejemplo n.º 1
0
/*
 * __bm_sync --
 *	Flush a file to disk.
 */
static int
__bm_sync(WT_BM *bm, WT_SESSION_IMPL *session, int async)
{
	return (async ?
	    __wt_fsync_async(session, bm->block->fh) :
	    __wt_fsync(session, bm->block->fh));
}
Ejemplo n.º 2
0
Archivo: err.c Proyecto: Asamaha/mongo
/*
 * __handle_message_default --
 *	Default WT_EVENT_HANDLER->handle_message implementation: send to stdout.
 */
static int
__handle_message_default(WT_EVENT_HANDLER *handler,
    WT_SESSION *wt_session, const char *message)
{
	WT_SESSION_IMPL *session;

	WT_UNUSED(handler);

	session = (WT_SESSION_IMPL *)wt_session;
	WT_RET(__wt_fprintf(session, WT_STDOUT(session), "%s\n", message));
	WT_RET(__wt_fsync(session, WT_STDOUT(session), true));
	return (0);
}
Ejemplo n.º 3
0
Archivo: err.c Proyecto: Asamaha/mongo
/*
 * __handle_error_default --
 *	Default WT_EVENT_HANDLER->handle_error implementation: send to stderr.
 */
static int
__handle_error_default(WT_EVENT_HANDLER *handler,
    WT_SESSION *wt_session, int error, const char *errmsg)
{
	WT_SESSION_IMPL *session;

	WT_UNUSED(handler);
	WT_UNUSED(error);

	session = (WT_SESSION_IMPL *)wt_session;

	WT_RET(__wt_fprintf(session, WT_STDERR(session), "%s\n", errmsg));
	WT_RET(__wt_fsync(session, WT_STDERR(session), true));
	return (0);
}
Ejemplo n.º 4
0
/*
 * __wt_sync_and_rename_fh --
 *	Sync and close a file, and swap it into place.
 */
int
__wt_sync_and_rename_fh(
    WT_SESSION_IMPL *session, WT_FH **fhp, const char *from, const char *to)
{
	WT_DECL_RET;
	WT_FH *fh;

	fh = *fhp;
	*fhp = NULL;

	/* Flush to disk and close the handle. */
	ret = __wt_fsync(session, fh);
	WT_TRET(__wt_close(session, &fh));
	WT_RET(ret);

	/* Rename the source file to the target. */
	WT_RET(__wt_rename(session, from, to));

	/* Flush the backing directory to guarantee the rename. */
	return (__wt_directory_sync(session, NULL));
}
Ejemplo n.º 5
0
/*
 * __wt_block_checkpoint --
 *	Create a new checkpoint.
 */
int
__wt_block_checkpoint(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_ITEM *buf, WT_CKPT *ckptbase)
{
	WT_BLOCK_CKPT *ci;

	ci = &block->live;
	ci->version = WT_BM_CHECKPOINT_VERSION;

	/*
	 * Write the root page: it's possible for there to be a checkpoint of
	 * an empty tree, in which case, we store an illegal root offset.
	 *
	 * XXX
	 * We happen to know that checkpoints are single-threaded above us in
	 * the btree engine.  That's probably something we want to guarantee
	 * for any WiredTiger block manager.
	 */
	if (buf == NULL) {
		ci->root_offset = WT_BLOCK_INVALID_OFFSET;
		ci->root_size = ci->root_cksum = 0;
	} else
		WT_RET(__wt_block_write_off(session, block, buf,
		    &ci->root_offset, &ci->root_size, &ci->root_cksum, 0));

	/* Process the checkpoint list, deleting and updating as required. */
	WT_RET(__ckpt_process(session, block, ckptbase));

	/*
	 * Checkpoints have to hit disk (it would be reasonable to configure for
	 * lazy checkpoints, but we don't support them yet).  Regardless, we're
	 * not holding any locks, other writers can proceed while we wait.
	 */
	if (F_ISSET(S2C(session), WT_CONN_SYNC))
		WT_RET(__wt_fsync(session, block->fh));

	return (0);
}
Ejemplo n.º 6
0
/*
 * __log_file_server --
 *	The log file server thread.  This worker thread manages
 *	log file operations such as closing and syncing.
 */
static WT_THREAD_RET
__log_file_server(void *arg)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_FH *close_fh;
	WT_LOG *log;
	WT_LSN close_end_lsn, min_lsn;
	WT_SESSION_IMPL *session;
	uint32_t filenum;
	int locked;

	session = arg;
	conn = S2C(session);
	log = conn->log;
	locked = 0;
	while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
		/*
		 * If there is a log file to close, make sure any outstanding
		 * write operations have completed, then fsync and close it.
		 */
		if ((close_fh = log->log_close_fh) != NULL) {
			WT_ERR(__wt_log_extract_lognum(session, close_fh->name,
			    &filenum));
			/*
			 * We update the close file handle before updating the
			 * close LSN when changing files.  It is possible we
			 * could see mismatched settings.  If we do, yield
			 * until it is set.  This should rarely happen.
			 */
			while (log->log_close_lsn.file < filenum)
				__wt_yield();

			if (__wt_log_cmp(
			    &log->write_lsn, &log->log_close_lsn) >= 0) {
				/*
				 * We've copied the file handle, clear out the
				 * one in the log structure to allow it to be
				 * set again.  Copy the LSN before clearing
				 * the file handle.
				 * Use a barrier to make sure the compiler does
				 * not reorder the following two statements.
				 */
				close_end_lsn = log->log_close_lsn;
				WT_FULL_BARRIER();
				log->log_close_fh = NULL;
				/*
				 * Set the close_end_lsn to the LSN immediately
				 * after ours.  That is, the beginning of the
				 * next log file.   We need to know the LSN
				 * file number of our own close in case earlier
				 * calls are still in progress and the next one
				 * to move the sync_lsn into the next file for
				 * later syncs.
				 */
				close_end_lsn.file++;
				close_end_lsn.offset = 0;
				WT_ERR(__wt_fsync(session, close_fh));
				__wt_spin_lock(session, &log->log_sync_lock);
				locked = 1;
				WT_ERR(__wt_close(session, &close_fh));
				WT_ASSERT(session, __wt_log_cmp(
				    &close_end_lsn, &log->sync_lsn) >= 0);
				log->sync_lsn = close_end_lsn;
				WT_ERR(__wt_cond_signal(
				    session, log->log_sync_cond));
				locked = 0;
				__wt_spin_unlock(session, &log->log_sync_lock);
			}
		}
		/*
		 * If a later thread asked for a background sync, do it now.
		 */
		if (__wt_log_cmp(&log->bg_sync_lsn, &log->sync_lsn) > 0) {
			/*
			 * Save the latest write LSN which is the minimum
			 * we will have written to disk.
			 */
			min_lsn = log->write_lsn;
			/*
			 * We have to wait until the LSN we asked for is
			 * written.  If it isn't signal the wrlsn thread
			 * to get it written.
			 */
			if (__wt_log_cmp(&log->bg_sync_lsn, &min_lsn) <= 0) {
				WT_ERR(__wt_fsync(session, log->log_fh));
				__wt_spin_lock(session, &log->log_sync_lock);
				locked = 1;
				/*
				 * The sync LSN could have advanced while we
				 * were writing to disk.
				 */
				if (__wt_log_cmp(
				    &log->sync_lsn, &min_lsn) <= 0) {
					log->sync_lsn = min_lsn;
					WT_ERR(__wt_cond_signal(
					    session, log->log_sync_cond));
				}
				locked = 0;
				__wt_spin_unlock(session, &log->log_sync_lock);
			} else {
				WT_ERR(__wt_cond_signal(
				    session, conn->log_wrlsn_cond));
				/*
				 * We do not want to wait potentially a second
				 * to process this.  Yield to give the wrlsn
				 * thread a chance to run and try again in
				 * this case.
				 */
				__wt_yield();
				continue;
			}
		}
		/* Wait until the next event. */
		WT_ERR(__wt_cond_wait(
		    session, conn->log_file_cond, WT_MILLION));
	}

	if (0) {
err:		__wt_err(session, ret, "log close server error");
	}
	if (locked)
		__wt_spin_unlock(session, &log->log_sync_lock);
	return (WT_THREAD_RET_VALUE);
}
Ejemplo n.º 7
0
/*
 * __log_file_server --
 *	The log file server thread.  This worker thread manages
 *	log file operations such as closing and syncing.
 */
static WT_THREAD_RET
__log_file_server(void *arg)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_FH *close_fh;
	WT_LOG *log;
	WT_LSN close_end_lsn, close_lsn, min_lsn;
	WT_SESSION_IMPL *session;
	int locked;

	session = arg;
	conn = S2C(session);
	log = conn->log;
	locked = 0;
	while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
		/*
		 * If there is a log file to close, make sure any outstanding
		 * write operations have completed, then fsync and close it.
		 */
		if ((close_fh = log->log_close_fh) != NULL &&
		    (ret = __wt_log_extract_lognum(session, close_fh->name,
		    &close_lsn.file)) == 0 &&
		    close_lsn.file < log->write_lsn.file) {
			/*
			 * We've copied the file handle, clear out the one in
			 * log structure to allow it to be set again.
			 */
			log->log_close_fh = NULL;
			/*
			 * Set the close_end_lsn to the LSN immediately after
			 * ours.  That is, the beginning of the next log file.
			 * We need to know the LSN file number of our own close
			 * in case earlier calls are still in progress and the
			 * next one to move the sync_lsn into the next file for
			 * later syncs.
			 */
			close_lsn.offset = 0;
			close_end_lsn = close_lsn;
			close_end_lsn.file++;
			WT_ERR(__wt_fsync(session, close_fh));
			__wt_spin_lock(session, &log->log_sync_lock);
			locked = 1;
			WT_ERR(__wt_close(session, &close_fh));
			WT_ASSERT(session,
			    WT_LOG_CMP(&close_end_lsn, &log->sync_lsn) >= 0);
			log->sync_lsn = close_end_lsn;
			WT_ERR(__wt_cond_signal(session, log->log_sync_cond));
			locked = 0;
			__wt_spin_unlock(session, &log->log_sync_lock);
		}
		/*
		 * If a later thread asked for a background sync, do it now.
		 */
		if (WT_LOG_CMP(&log->bg_sync_lsn, &log->sync_lsn) > 0) {
			/*
			 * Save the latest write LSN which is the minimum
			 * we will have written to disk.
			 */
			min_lsn = log->write_lsn;
			/*
			 * The sync LSN we asked for better be smaller than
			 * the current written LSN.
			 */
			WT_ASSERT(session,
			    WT_LOG_CMP(&log->bg_sync_lsn, &min_lsn) <= 0);
			WT_ERR(__wt_fsync(session, log->log_fh));
			__wt_spin_lock(session, &log->log_sync_lock);
			locked = 1;
			/*
			 * The sync LSN could have advanced while we were
			 * writing to disk.
			 */
			if (WT_LOG_CMP(&log->sync_lsn, &min_lsn) <= 0) {
				log->sync_lsn = min_lsn;
				WT_ERR(__wt_cond_signal(
				    session, log->log_sync_cond));
			}
			locked = 0;
			__wt_spin_unlock(session, &log->log_sync_lock);
		}
		/* Wait until the next event. */
		WT_ERR(__wt_cond_wait(
		    session, conn->log_file_cond, WT_MILLION));
	}

	if (0) {
err:		__wt_err(session, ret, "log close server error");
	}
	if (locked)
		__wt_spin_unlock(session, &log->log_sync_lock);
	return (WT_THREAD_RET_VALUE);
}
Ejemplo n.º 8
0
/*
 * __wt_log_newfile --
 *	Create the next log file and write the file header record into it.
 */
int
__wt_log_newfile(WT_SESSION_IMPL *session, int conn_create)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_ITEM(buf);
	WT_DECL_RET;
	WT_LOG *log;
	WT_LOG_DESC *desc;
	WT_LOG_RECORD *logrec;
	WT_LOGSLOT tmp;
	WT_MYSLOT myslot;

	conn = S2C(session);
	log = conn->log;

	/*
	 * Set aside the log file handle to be closed later.  Other threads
	 * may still be using it to write to the log.
	 */
	WT_ASSERT(session, log->log_close_fh == NULL);
	log->log_close_fh = log->log_fh;
	log->fileid++;
	WT_RET(__log_openfile(session, 1, &log->log_fh, log->fileid));
	log->alloc_lsn.file = log->fileid;
	log->alloc_lsn.offset = log->log_fh->size;

	/*
	 * Set up the log descriptor record.  Use a scratch buffer to
	 * get correct alignment for direct I/O.
	 */
	WT_ASSERT(session, sizeof(WT_LOG_DESC) < log->allocsize);
	WT_RET(__wt_scr_alloc(session, log->allocsize, &buf));
	memset(buf->mem, 0, log->allocsize);
	logrec = (WT_LOG_RECORD *)buf->mem;
	desc = (WT_LOG_DESC *)logrec->record;
	desc->log_magic = WT_LOG_MAGIC;
	desc->majorv = WT_LOG_MAJOR_VERSION;
	desc->minorv = WT_LOG_MINOR_VERSION;
	desc->log_size = (uint64_t)conn->log_file_max;

	/*
	 * Now that the record is set up, initialize the record header.
	 */
	logrec->len = log->allocsize;
	logrec->checksum = 0;
	logrec->checksum = __wt_cksum(logrec, log->allocsize);
	WT_CLEAR(tmp);
	myslot.slot = &tmp;
	myslot.offset = 0;

	/*
	 * Recursively call __log_acquire to allocate log space for the
	 * log descriptor record.  Call __log_fill to write it, but we
	 * do not need to call __log_release because we're not waiting for
	 * earlier operations to complete.
	 */
	WT_ERR(__log_acquire(session, logrec->len, &tmp));
	WT_ERR(__log_fill(session, &myslot, 1, buf, NULL));

	/*
	 * If we're called from connection creation code, we need to update
	 * the LSNs since we're the only write in progress.
	 */
	if (conn_create) {
		WT_ERR(__wt_fsync(session, log->log_fh));
		log->sync_lsn = tmp.slot_end_lsn;
		log->write_lsn = tmp.slot_end_lsn;
	}

err:	__wt_scr_free(&buf);
	return (ret);
}
Ejemplo n.º 9
0
/*
 * __log_release --
 *	Release a log slot.
 */
static int
__log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_FH *close_fh;
	WT_LOG *log;
	WT_LSN sync_lsn;
	size_t write_size;
	WT_DECL_SPINLOCK_ID(id);			/* Must appear last */

	conn = S2C(session);
	log = conn->log;
	/*
	 * If we're going to have to close our log file, make a local copy
	 * of the file handle structure.
	 */
	close_fh = NULL;
	if (F_ISSET(slot, SLOT_CLOSEFH)) {
		close_fh = log->log_close_fh;
		log->log_close_fh = NULL;
		F_CLR(slot, SLOT_CLOSEFH);
	}

	/* Write the buffered records */
	if (F_ISSET(slot, SLOT_BUFFERED)) {
		write_size = (size_t)
		    (slot->slot_end_lsn.offset - slot->slot_start_offset);
		WT_ERR(__wt_write(session, slot->slot_fh,
		    slot->slot_start_offset, write_size, slot->slot_buf.mem));
	}

	/*
	 * Wait for earlier groups to finish, otherwise there could be holes
	 * in the log file.
	 */
	while (LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0)
		__wt_yield();
	log->write_lsn = slot->slot_end_lsn;
	/*
	 * Try to consolidate calls to fsync to wait less.  Acquire a spin lock
	 * so that threads finishing writing to the log will wait while the
	 * current fsync completes and advance log->write_lsn.
	 */
	while (F_ISSET(slot, SLOT_SYNC) &&
	    LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
		if (__wt_spin_trylock(session, &log->log_sync_lock, &id) != 0) {
			(void)__wt_cond_wait(
			    session, log->log_sync_cond, 10000);
			continue;
		}
		/*
		 * Record the current end of log after we grabbed the lock.
		 * That is how far our fsync call with guarantee.
		 */
		sync_lsn = log->write_lsn;
		if (LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
			WT_STAT_FAST_CONN_INCR(session, log_sync);
			ret = __wt_fsync(session, log->log_fh);
			if (ret == 0) {
				F_CLR(slot, SLOT_SYNC);
				log->sync_lsn = sync_lsn;
				ret = __wt_cond_signal(
				    session, log->log_sync_cond);
			}
		}
		__wt_spin_unlock(session, &log->log_sync_lock);
		WT_ERR(ret);
	}
	if (F_ISSET(slot, SLOT_BUF_GROW)) {
		WT_STAT_FAST_CONN_INCR(session, log_buffer_grow);
		F_CLR(slot, SLOT_BUF_GROW);
		WT_STAT_FAST_CONN_INCRV(session,
		    log_buffer_size, slot->slot_buf.memsize);
		WT_ERR(__wt_buf_grow(session,
		    &slot->slot_buf, slot->slot_buf.memsize * 2));
	}
	/*
	 * If we have a file to close, close it now.
	 */
	if (close_fh)
		WT_ERR(__wt_close(session, close_fh));

err:	if (ret != 0 && slot->slot_error == 0)
		slot->slot_error = ret;
	return (ret);
}
Ejemplo n.º 10
0
/*
 * __log_file_server --
 *	The log file server thread.  This worker thread manages
 *	log file operations such as closing and syncing.
 */
static WT_THREAD_RET
__log_file_server(void *arg)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_FH *close_fh;
	WT_LOG *log;
	WT_LSN close_end_lsn, min_lsn;
	WT_SESSION_IMPL *session;
	uint32_t filenum;
	bool locked;

	session = arg;
	conn = S2C(session);
	log = conn->log;
	locked = false;
	while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
		/*
		 * If there is a log file to close, make sure any outstanding
		 * write operations have completed, then fsync and close it.
		 */
		if ((close_fh = log->log_close_fh) != NULL) {
			WT_ERR(__wt_log_extract_lognum(session, close_fh->name,
			    &filenum));
			/*
			 * We update the close file handle before updating the
			 * close LSN when changing files.  It is possible we
			 * could see mismatched settings.  If we do, yield
			 * until it is set.  This should rarely happen.
			 */
			while (log->log_close_lsn.l.file < filenum)
				__wt_yield();

			if (__wt_log_cmp(
			    &log->write_lsn, &log->log_close_lsn) >= 0) {
				/*
				 * We've copied the file handle, clear out the
				 * one in the log structure to allow it to be
				 * set again.  Copy the LSN before clearing
				 * the file handle.
				 * Use a barrier to make sure the compiler does
				 * not reorder the following two statements.
				 */
				close_end_lsn = log->log_close_lsn;
				WT_FULL_BARRIER();
				log->log_close_fh = NULL;
				/*
				 * Set the close_end_lsn to the LSN immediately
				 * after ours.  That is, the beginning of the
				 * next log file.   We need to know the LSN
				 * file number of our own close in case earlier
				 * calls are still in progress and the next one
				 * to move the sync_lsn into the next file for
				 * later syncs.
				 */
				WT_ERR(__wt_fsync(session, close_fh, true));

				/*
				 * We want to have the file size reflect actual
				 * data with minimal pre-allocated zeroed space.
				 * We can't truncate the file during hot backup,
				 * or the underlying file system may not support
				 * truncate: both are OK, it's just more work
				 * during cursor traversal.
				 */
				if (!conn->hot_backup) {
					__wt_readlock(
					    session, conn->hot_backup_lock);
					if (!conn->hot_backup)
						WT_ERR_ERROR_OK(
						    __wt_ftruncate(session,
						    close_fh,
						    close_end_lsn.l.offset),
						    ENOTSUP);
					__wt_readunlock(
					    session, conn->hot_backup_lock);
				}
				WT_SET_LSN(&close_end_lsn,
				    close_end_lsn.l.file + 1, 0);
				__wt_spin_lock(session, &log->log_sync_lock);
				locked = true;
				WT_ERR(__wt_close(session, &close_fh));
				WT_ASSERT(session, __wt_log_cmp(
				    &close_end_lsn, &log->sync_lsn) >= 0);
				log->sync_lsn = close_end_lsn;
				__wt_cond_signal(session, log->log_sync_cond);
				locked = false;
				__wt_spin_unlock(session, &log->log_sync_lock);
			}
		}
		/*
		 * If a later thread asked for a background sync, do it now.
		 */
		if (__wt_log_cmp(&log->bg_sync_lsn, &log->sync_lsn) > 0) {
			/*
			 * Save the latest write LSN which is the minimum
			 * we will have written to disk.
			 */
			min_lsn = log->write_lsn;
			/*
			 * We have to wait until the LSN we asked for is
			 * written.  If it isn't signal the wrlsn thread
			 * to get it written.
			 *
			 * We also have to wait for the written LSN and the
			 * sync LSN to be in the same file so that we know we
			 * have synchronized all earlier log files.
			 */
			if (__wt_log_cmp(&log->bg_sync_lsn, &min_lsn) <= 0) {
				/*
				 * If the sync file is behind either the one
				 * wanted for a background sync or the write LSN
				 * has moved to another file continue to let
				 * this worker thread process that older file
				 * immediately.
				 */
				if ((log->sync_lsn.l.file <
				    log->bg_sync_lsn.l.file) ||
				    (log->sync_lsn.l.file < min_lsn.l.file))
					continue;
				WT_ERR(__wt_fsync(session, log->log_fh, true));
				__wt_spin_lock(session, &log->log_sync_lock);
				locked = true;
				/*
				 * The sync LSN could have advanced while we
				 * were writing to disk.
				 */
				if (__wt_log_cmp(
				    &log->sync_lsn, &min_lsn) <= 0) {
					WT_ASSERT(session,
					    min_lsn.l.file ==
					    log->sync_lsn.l.file);
					log->sync_lsn = min_lsn;
					__wt_cond_signal(
					    session, log->log_sync_cond);
				}
				locked = false;
				__wt_spin_unlock(session, &log->log_sync_lock);
			} else {
				__wt_cond_auto_signal(
				    session, conn->log_wrlsn_cond);
				/*
				 * We do not want to wait potentially a second
				 * to process this.  Yield to give the wrlsn
				 * thread a chance to run and try again in
				 * this case.
				 */
				__wt_yield();
				continue;
			}
		}
		/* Wait until the next event. */
		__wt_cond_wait(session, conn->log_file_cond, WT_MILLION / 10);
	}

	if (0) {
err:		__wt_err(session, ret, "log close server error");
	}
	if (locked)
		__wt_spin_unlock(session, &log->log_sync_lock);
	return (WT_THREAD_RET_VALUE);
}
Ejemplo n.º 11
0
/*
 * __block_write_off --
 *	Write a buffer into a block, returning the block's offset, size and
 * checksum.
 */
static int
__block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *checksump,
    bool data_checksum, bool checkpoint_io, bool caller_locked)
{
	WT_BLOCK_HEADER *blk;
	WT_DECL_RET;
	WT_FH *fh;
	wt_off_t offset;
	size_t align_size;
	uint32_t checksum;
	bool local_locked;

	*offsetp = 0;			/* -Werror=maybe-uninitialized */
	*sizep = 0;			/* -Werror=maybe-uninitialized */
	*checksump = 0;			/* -Werror=maybe-uninitialized */

	fh = block->fh;

	/*
	 * Clear the block header to ensure all of it is initialized, even the
	 * unused fields.
	 */
	blk = WT_BLOCK_HEADER_REF(buf->mem);
	memset(blk, 0, sizeof(*blk));

	/* Buffers should be aligned for writing. */
	if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
		WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
		WT_RET_MSG(session, EINVAL,
		    "direct I/O check: write buffer incorrectly allocated");
	}

	/*
	 * Align the size to an allocation unit.
	 *
	 * The buffer must be big enough for us to zero to the next allocsize
	 * boundary, this is one of the reasons the btree layer must find out
	 * from the block-manager layer the maximum size of the eventual write.
	 */
	align_size = WT_ALIGN(buf->size, block->allocsize);
	if (align_size > buf->memsize) {
		WT_ASSERT(session, align_size <= buf->memsize);
		WT_RET_MSG(session, EINVAL,
		    "buffer size check: write buffer incorrectly allocated");
	}
	if (align_size > UINT32_MAX) {
		WT_ASSERT(session, align_size <= UINT32_MAX);
		WT_RET_MSG(session, EINVAL,
		    "buffer size check: write buffer too large to write");
	}

	/* Zero out any unused bytes at the end of the buffer. */
	memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size);

	/*
	 * Set the disk size so we don't have to incrementally read blocks
	 * during salvage.
	 */
	blk->disk_size = WT_STORE_SIZE(align_size);

	/*
	 * Update the block's checksum: if our caller specifies, checksum the
	 * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP
	 * bytes.  The assumption is applications with good compression support
	 * turn off checksums and assume corrupted blocks won't decompress
	 * correctly.  However, if compression failed to shrink the block, the
	 * block wasn't compressed, in which case our caller will tell us to
	 * checksum the data to detect corruption. If compression succeeded,
	 * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes
	 * because they're not compressed, both to give salvage a quick test
	 * of whether a block is useful and to give us a test so we don't lose
	 * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing.
	 *
	 * Checksum a little-endian version of the header, and write everything
	 * in little-endian format. The checksum is (potentially) returned in a
	 * big-endian format, swap it into place in a separate step.
	 */
	blk->flags = 0;
	if (data_checksum)
		F_SET(blk, WT_BLOCK_DATA_CKSUM);
	blk->checksum = 0;
	__wt_block_header_byteswap(blk);
	blk->checksum = checksum = __wt_checksum(
	    buf->mem, data_checksum ? align_size : WT_BLOCK_COMPRESS_SKIP);
#ifdef WORDS_BIGENDIAN
	blk->checksum = __wt_bswap32(blk->checksum);
#endif

	/* Pre-allocate some number of extension structures. */
	WT_RET(__wt_block_ext_prealloc(session, 5));

	/*
	 * Acquire a lock, if we don't already hold one.
	 * Allocate space for the write, and optionally extend the file (note
	 * the block-extend function may release the lock).
	 * Release any locally acquired lock.
	 */
	local_locked = false;
	if (!caller_locked) {
		__wt_spin_lock(session, &block->live_lock);
		local_locked = true;
	}
	ret = __wt_block_alloc(session, block, &offset, (wt_off_t)align_size);
	if (ret == 0)
		ret = __wt_block_extend(
		    session, block, fh, offset, align_size, &local_locked);
	if (local_locked)
		__wt_spin_unlock(session, &block->live_lock);
	WT_RET(ret);

	/* Write the block. */
	if ((ret =
	    __wt_write(session, fh, offset, align_size, buf->mem)) != 0) {
		if (!caller_locked)
			__wt_spin_lock(session, &block->live_lock);
		WT_TRET(__wt_block_off_free(
		    session, block, offset, (wt_off_t)align_size));
		if (!caller_locked)
			__wt_spin_unlock(session, &block->live_lock);
		WT_RET(ret);
	}

	/*
	 * Optionally schedule writes for dirty pages in the system buffer
	 * cache, but only if the current session can wait.
	 */
	if (block->os_cache_dirty_max != 0 &&
	    fh->written > block->os_cache_dirty_max &&
	    __wt_session_can_wait(session)) {
		fh->written = 0;
		if ((ret = __wt_fsync(session, fh, false)) != 0) {
			 /*
			  * Ignore ENOTSUP, but don't try again.
			  */
			if (ret != ENOTSUP)
				return (ret);
			block->os_cache_dirty_max = 0;
		}
	}

	/* Optionally discard blocks from the buffer cache. */
	WT_RET(__wt_block_discard(session, block, align_size));

	WT_STAT_CONN_INCR(session, block_write);
	WT_STAT_CONN_INCRV(session, block_byte_write, align_size);
	if (checkpoint_io)
		WT_STAT_CONN_INCRV(
		    session, block_byte_write_checkpoint, align_size);

	__wt_verbose(session, WT_VERB_WRITE,
	    "off %" PRIuMAX ", size %" PRIuMAX ", checksum %#" PRIx32,
	    (uintmax_t)offset, (uintmax_t)align_size, checksum);

	*offsetp = offset;
	*sizep = WT_STORE_SIZE(align_size);
	*checksump = checksum;

	return (0);
}
Ejemplo n.º 12
0
/*
 * __bm_sync --
 *	Flush a file to disk.
 */
static int
__bm_sync(WT_BM *bm, WT_SESSION_IMPL *session)
{
	return (__wt_fsync(session, bm->block->fh));
}