Example #1
/*
 * __log_server --
 *	The log server thread.
 */
static WT_THREAD_RET
__log_server(void *arg)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	WT_SESSION_IMPL *session;
	int freq_per_sec, signalled;

	session = arg;
	conn = S2C(session);
	log = conn->log;
	signalled = 0;

	/*
	 * Set this to the number of times per second we want to force out the
	 * log slot buffer.
	 */
#define	WT_FORCE_PER_SECOND	20
	freq_per_sec = WT_FORCE_PER_SECOND;

	/*
	 * The log server thread does a variety of work.  It forces out any
	 * buffered log writes.  It pre-allocates log files and it performs
 * log archiving.  The wrlsn thread does not force out the buffered
 * writes because we want it to process and move the write_lsn forward
 * as quickly as possible.  Similarly, the log file server thread does
 * not force out the writes: that thread makes fsync calls, which can
 * take a long time, and we don't want log records sitting in the
 * buffer for the time it takes to sync out an earlier file.
	 */
	while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
		/*
		 * Slots depend on future activity.  Force out buffered
		 * writes in case we are idle.  This cannot be part of the
		 * wrlsn thread because it interacts with advancing the
		 * write_lsn: a synchronous buffer may need to wait for the
		 * write_lsn to advance, and we would end up with a hang.
		 */
		WT_ERR_BUSY_OK(__wt_log_force_write(session, 0));

		/*
		 * We don't want to archive or pre-allocate files as often as
		 * we want to force out log buffers.  Only do it once per second
		 * or if the condition was signalled.
		 */
		if (--freq_per_sec <= 0 || signalled != 0) {
			freq_per_sec = WT_FORCE_PER_SECOND;

			/*
			 * Perform log pre-allocation.
			 */
			if (conn->log_prealloc > 0)
				WT_ERR(__log_prealloc_once(session));

			/*
			 * Perform the archive.
			 */
			if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) {
				if (__wt_try_writelock(
				    session, log->log_archive_lock) == 0) {
					ret = __log_archive_once(session, 0);
					WT_TRET(__wt_writeunlock(
					    session, log->log_archive_lock));
					WT_ERR(ret);
				} else
					WT_ERR(
					    __wt_verbose(session, WT_VERB_LOG,
					    "log_archive: Blocked due to open "
					    "log cursor holding archive lock"));
			}
		}

		/*
		 * Wait until the next event or for the polling interval:
		 * WT_MILLION / WT_FORCE_PER_SECOND microseconds, i.e., 50ms
		 * at 20 forced writes per second.
		 */
		WT_ERR(__wt_cond_wait_signal(session, conn->log_cond,
		    WT_MILLION / WT_FORCE_PER_SECOND, &signalled));
	}

	if (0) {
err:		__wt_err(session, ret, "log server error");
	}
	return (WT_THREAD_RET_VALUE);
}
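
The server loop is driven from outside: the connection creates the thread when logging starts and stops it by clearing WT_CONN_LOG_SERVER_RUN and waking conn->log_cond. The sketch below shows that wiring under stated assumptions: the __wt_thread_create, __wt_thread_join and __wt_cond_signal helpers are the usual WiredTiger internals (exact signatures vary between versions), and the log_tid and log_session connection fields are illustrative names.

/*
 * Sketch only: starting and stopping the log server thread.  The
 * log_tid and log_session fields are assumed names for illustration.
 */
static int
__log_server_start(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;

	conn = S2C(session);

	/* Let the server loop run, then create the thread. */
	F_SET(conn, WT_CONN_LOG_SERVER_RUN);
	return (__wt_thread_create(
	    session, &conn->log_tid, __log_server, conn->log_session));
}

static int
__log_server_stop(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;

	conn = S2C(session);

	/* Clear the run flag, then wake the server so it exits its loop. */
	F_CLR(conn, WT_CONN_LOG_SERVER_RUN);
	WT_RET(__wt_cond_signal(session, conn->log_cond));
	return (__wt_thread_join(session, conn->log_tid));
}
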
Example #2
/*
 * __log_server --
 *	The log server thread.
 */
static WT_THREAD_RET
__log_server(void *arg)
{
	struct timespec start, now;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	WT_SESSION_IMPL *session;
	uint64_t timediff;
	bool did_work, locked, signalled;

	session = arg;
	conn = S2C(session);
	log = conn->log;
	locked = signalled = false;

	/*
	 * timediff tracks the milliseconds since we last ran archiving and
	 * pre-allocation.  Initialize it to a full second so both run on
	 * the first pass through the loop.
	 */
	timediff = WT_THOUSAND;

	/*
	 * The log server thread does a variety of work.  It forces out any
	 * buffered log writes.  It pre-allocates log files and it performs
 * log archiving.  The wrlsn thread does not force out the buffered
 * writes because we want it to process and move the write_lsn forward
 * as quickly as possible.  Similarly, the log file server thread does
 * not force out the writes: that thread makes fsync calls, which can
 * take a long time, and we don't want log records sitting in the
 * buffer for the time it takes to sync out an earlier file.
	 */
	did_work = true;
	while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
		/*
		 * Slots depend on future activity.  Force out buffered
		 * writes in case we are idle.  This cannot be part of the
		 * wrlsn thread because it interacts with advancing the
		 * write_lsn: a synchronous buffer may need to wait for the
		 * write_lsn to advance, and we would end up with a hang.
		 */
		WT_ERR_BUSY_OK(__wt_log_force_write(session, 0, &did_work));

		/*
		 * We don't want to archive or pre-allocate files as often as
		 * we want to force out log buffers.  Only do it once per second
		 * or if the condition was signalled.
		 */
		if (timediff >= WT_THOUSAND || signalled) {
			/*
			 * Perform log pre-allocation.
			 */
			if (conn->log_prealloc > 0) {
				/*
				 * Log file pre-allocation is disabled when a
				 * hot backup cursor is open because we have
				 * agreed not to rename or remove any files in
				 * the database directory.
				 */
				WT_ERR(__wt_readlock(
				    session, conn->hot_backup_lock));
				locked = true;
				if (!conn->hot_backup)
					WT_ERR(__log_prealloc_once(session));
				WT_ERR(__wt_readunlock(
				    session, conn->hot_backup_lock));
				locked = false;
			}

			/*
			 * Perform the archive.
			 */
			if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) {
				if (__wt_try_writelock(
				    session, log->log_archive_lock) == 0) {
					ret = __log_archive_once(session, 0);
					WT_TRET(__wt_writeunlock(
					    session, log->log_archive_lock));
					WT_ERR(ret);
				} else
					WT_ERR(
					    __wt_verbose(session, WT_VERB_LOG,
					    "log_archive: Blocked due to open "
					    "log cursor holding archive lock"));
			}
		}

		/* Wait until the next event. */
		WT_ERR(__wt_epoch(session, &start));
		WT_ERR(__wt_cond_auto_wait_signal(session, conn->log_cond,
		    did_work, &signalled));
		WT_ERR(__wt_epoch(session, &now));
		timediff = WT_TIMEDIFF_MS(now, start);
	}

	if (0) {
err:		__wt_err(session, ret, "log server error");
		if (locked)
			WT_TRET(__wt_readunlock(
			    session, conn->hot_backup_lock));
	}
	return (WT_THREAD_RET_VALUE);
}
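
Both versions lean on WiredTiger's goto-based error handling: WT_ERR jumps to the err label on failure, WT_ERR_BUSY_OK additionally treats EBUSY as a non-fatal return, WT_TRET folds a cleanup call's failure into ret without losing an earlier error, and the "if (0) { err: ... }" block keeps the error path out of the main flow. A simplified sketch of those macros follows; the shipped versions add panic and diagnostic handling.

#define	WT_DECL_RET	int ret = 0

/* On failure, set ret and jump to the function's err label. */
#define	WT_ERR(a) do {						\
	if ((ret = (a)) != 0)					\
		goto err;					\
} while (0)

/* Like WT_ERR, but EBUSY is an expected, non-fatal return. */
#define	WT_ERR_BUSY_OK(a) do {					\
	if ((ret = (a)) != 0) {					\
		if (ret == EBUSY)				\
			ret = 0;				\
		else						\
			goto err;				\
	}							\
} while (0)

/* Keep the first failure seen; don't let cleanup overwrite it. */
#define	WT_TRET(a) do {						\
	int __ret;						\
	if ((__ret = (a)) != 0 && ret == 0)			\
		ret = __ret;					\
} while (0)
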
Example #3
/*
 * __sync_file --
 *	Flush pages for a specific file.
 */
static int
__sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
{
	WT_BTREE *btree;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	WT_REF *prev, *walk;
	WT_TXN *txn;
	uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages;
	uint64_t oldest_id, saved_pinned_id, time_start, time_stop;
	uint32_t flags;
	bool timer, tried_eviction;

	conn = S2C(session);
	btree = S2BT(session);
	prev = walk = NULL;
	txn = &session->txn;
	tried_eviction = false;
	time_start = time_stop = 0;

	/* Only visit pages in cache and don't bump page read generations. */
	flags = WT_READ_CACHE | WT_READ_NO_GEN;

	/*
	 * Skip all deleted pages.  For a page to be marked deleted, it must
	 * have been evicted from cache and marked clean.  Checkpoint should
	 * never instantiate deleted pages: if a truncate is not visible to the
	 * checkpoint, the on-disk version is correct.  If the truncate is
	 * visible, we skip over the child page when writing its parent.  We
	 * check whether a truncate is visible in the checkpoint as part of
	 * reconciling internal pages (specifically in __rec_child_modify).
	 */
	LF_SET(WT_READ_DELETED_SKIP);

	internal_bytes = leaf_bytes = 0;
	internal_pages = leaf_pages = 0;
	saved_pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id;
	timer = WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT);
	if (timer)
		time_start = __wt_clock(session);

	switch (syncop) {
	case WT_SYNC_WRITE_LEAVES:
		/*
		 * Write all immediately available, dirty in-cache leaf pages.
		 *
		 * Writing the leaf pages is done without acquiring a
		 * high-level lock; serialize here so multiple threads don't
		 * walk the tree at the same time.
		 */
		if (!btree->modified)
			return (0);
		__wt_spin_lock(session, &btree->flush_lock);
		if (!btree->modified) {
			__wt_spin_unlock(session, &btree->flush_lock);
			return (0);
		}

		/*
		 * Save the oldest transaction ID we need to keep around.
		 * Otherwise, in a busy system, we could be updating pages so
		 * fast that write leaves never catches up.  We deliberately
		 * have no transaction running at this point that would keep
		 * the oldest ID from moving forwards as we walk the tree.
		 */
		oldest_id = __wt_txn_oldest_id(session);

		LF_SET(WT_READ_NO_WAIT | WT_READ_SKIP_INTL);
		for (;;) {
			WT_ERR(__wt_tree_walk(session, &walk, flags));
			if (walk == NULL)
				break;

			/*
			 * Write dirty pages if nobody beat us to it.  Don't
			 * try to write hot pages (defined as pages that have
			 * been updated since the write-leaves phase started):
			 * checkpoint will have to visit them anyway.
			 */
			page = walk->page;
			if (__wt_page_is_modified(page) &&
			    WT_TXNID_LT(page->modify->update_txn, oldest_id)) {
				if (txn->isolation == WT_ISO_READ_COMMITTED)
					__wt_txn_get_snapshot(session);
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
				WT_ERR(__wt_reconcile(session,
				    walk, NULL, WT_REC_CHECKPOINT, NULL));
			}
		}
		break;
	case WT_SYNC_CHECKPOINT:
		/*
		 * If we are flushing a file at read-committed isolation, which
		 * is of particular interest for flushing the metadata to make
		 * a schema-changing operation durable, get a transactional
		 * snapshot now.
		 *
		 * All changes committed up to this point should be included.
		 * We don't update the snapshot in between pages because the
		 * metadata shouldn't have many pages.  Instead, read-committed
		 * isolation ensures that all metadata updates completed before
		 * the checkpoint are included.
		 */
		if (txn->isolation == WT_ISO_READ_COMMITTED)
			__wt_txn_get_snapshot(session);

		/*
		 * We cannot check the tree modified flag in the case of a
		 * checkpoint: the checkpoint code has already cleared it.
		 *
		 * Writing the leaf pages is done without acquiring a
		 * high-level lock; serialize here so multiple threads don't
		 * walk the tree at the same time.  We're holding the schema
		 * lock, but need the lower-level lock as well.
		 */
		__wt_spin_lock(session, &btree->flush_lock);

		/*
		 * In the final checkpoint pass, child pages cannot be evicted
		 * from underneath internal pages nor can underlying blocks be
		 * freed until the checkpoint's block lists are stable. Also,
		 * we cannot split child pages into parents unless we know the
		 * final pass will write a consistent view of that namespace.
		 * Set the checkpointing flag to block such actions and wait for
		 * any problematic eviction or page splits to complete.
		 */
		WT_ASSERT(session, btree->syncing == WT_BTREE_SYNC_OFF &&
		    btree->sync_session == NULL);

		btree->sync_session = session;
		btree->syncing = WT_BTREE_SYNC_WAIT;
		(void)__wt_gen_next_drain(session, WT_GEN_EVICT);
		btree->syncing = WT_BTREE_SYNC_RUNNING;

		/* Write all dirty in-cache pages. */
		LF_SET(WT_READ_NO_EVICT);

		/* Read pages with lookaside entries and evict them asap. */
		LF_SET(WT_READ_LOOKASIDE | WT_READ_WONT_NEED);

		for (;;) {
			WT_ERR(__sync_dup_walk(session, walk, flags, &prev));
			WT_ERR(__wt_tree_walk(session, &walk, flags));

			if (walk == NULL)
				break;

			/*
			 * Skip clean pages, but make sure the maximum
			 * transaction ID and timestamp are always updated.
			 */
			if (!__wt_page_is_modified(walk->page)) {
				if (((mod = walk->page->modify) != NULL) &&
				    mod->rec_max_txn > btree->rec_max_txn)
					btree->rec_max_txn = mod->rec_max_txn;
				if (mod != NULL &&
				    btree->rec_max_timestamp <
				    mod->rec_max_timestamp)
					btree->rec_max_timestamp =
					    mod->rec_max_timestamp;
				continue;
			}

			/*
			 * Take a local reference to the page modify structure
			 * now that we know the page is dirty.  It needs to be
			 * done in this order; otherwise, the page modify
			 * structure could have been created between taking the
			 * reference and checking modified.
			 */
			page = walk->page;

			/*
			 * Write dirty pages, if we can't skip them. If we skip
			 * a page, mark the tree dirty. The checkpoint marked it
			 * clean and we can't skip future checkpoints until this
			 * page is written.
			 */
			if (__sync_checkpoint_can_skip(session, page)) {
				__wt_tree_modify_set(session);
				continue;
			}

			if (WT_PAGE_IS_INTERNAL(page)) {
				internal_bytes += page->memory_footprint;
				++internal_pages;
			} else {
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
			}

			/*
			 * If the page was pulled into cache by our read, try
			 * to evict it now.
			 *
			 * For eviction to have a chance, we first need to move
			 * the walk point to the next page checkpoint will
			 * visit.  We want to avoid this code being too special
			 * purpose, so try to reuse the ordinary eviction path.
			 *
			 * Regardless of whether eviction succeeds or fails,
			 * the walk continues from the previous location.  We
			 * remember whether we tried eviction, and don't try
			 * again.  Even if eviction fails (the page may stay in
			 * cache clean but with history that cannot be
			 * discarded), that is not wasted effort because
			 * checkpoint doesn't need to write the page again.
			 */
			if (!WT_PAGE_IS_INTERNAL(page) &&
			    page->read_gen == WT_READGEN_WONT_NEED &&
			    !tried_eviction) {
				WT_ERR_BUSY_OK(
				    __wt_page_release_evict(session, walk));
				walk = prev;
				prev = NULL;
				tried_eviction = true;
				continue;
			}
			tried_eviction = false;

			WT_ERR(__wt_reconcile(
			    session, walk, NULL, WT_REC_CHECKPOINT, NULL));

			/*
			 * Update checkpoint IO tracking data if configured
			 * to log verbose progress messages.
			 */
			if (conn->ckpt_timer_start.tv_sec > 0) {
				conn->ckpt_write_bytes +=
				    page->memory_footprint;
				++conn->ckpt_write_pages;

				/* Periodically log checkpoint progress. */
				if (conn->ckpt_write_pages % 5000 == 0)
					__wt_checkpoint_progress(
					    session, false);
			}
		}
		break;
	case WT_SYNC_CLOSE:
	case WT_SYNC_DISCARD:
		WT_ERR(__wt_illegal_value(session, syncop));
		break;
	}

	if (timer) {
		time_stop = __wt_clock(session);
		__wt_verbose(session, WT_VERB_CHECKPOINT,
		    "__sync_file WT_SYNC_%s wrote: %" PRIu64
		    " leaf pages (%" PRIu64 "B), %" PRIu64
		    " internal pages (%" PRIu64 "B), and took %" PRIu64 "ms",
		    syncop == WT_SYNC_WRITE_LEAVES ?
		    "WRITE_LEAVES" : "CHECKPOINT",
		    leaf_pages, leaf_bytes, internal_pages, internal_bytes,
		    WT_CLOCKDIFF_MS(time_stop, time_start));
	}

err:	/* On error, clear any left-over tree walk. */
	WT_TRET(__wt_page_release(session, walk, flags));
	WT_TRET(__wt_page_release(session, prev, flags));

	/*
	 * If we got a snapshot in order to write pages, and there was no
	 * snapshot active when we started, release it.
	 */
	if (txn->isolation == WT_ISO_READ_COMMITTED &&
	    saved_pinned_id == WT_TXN_NONE)
		__wt_txn_release_snapshot(session);

	/* Clear the checkpoint flag. */
	btree->syncing = WT_BTREE_SYNC_OFF;
	btree->sync_session = NULL;

	__wt_spin_unlock(session, &btree->flush_lock);

	/*
	 * Leaves are written before a checkpoint (or as part of a file close,
	 * before checkpointing the file).  Start a flush to stable storage,
	 * but don't wait for it.
	 */
	if (ret == 0 &&
	    syncop == WT_SYNC_WRITE_LEAVES && F_ISSET(conn, WT_CONN_CKPT_SYNC))
		WT_RET(btree->bm->sync(btree->bm, session, false));

	return (ret);
}
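
Both walk loops above follow the same skeleton: handing the current WT_REF back to __wt_tree_walk releases the hazard pointer on that page and acquires one on the next, a NULL WT_REF means the walk is complete, and the error path releases whatever reference is still outstanding (__wt_page_release ignores a NULL reference, as the err path of __sync_file relies on). Stripped to its essentials, using only the calls visible above:

/*
 * Sketch only: the in-cache tree-walk skeleton used by __sync_file.
 */
static int
__walk_sketch(WT_SESSION_IMPL *session)
{
	WT_DECL_RET;
	WT_REF *walk;
	uint32_t flags;

	walk = NULL;
	flags = WT_READ_CACHE | WT_READ_NO_GEN;

	for (;;) {
		WT_ERR(__wt_tree_walk(session, &walk, flags));
		if (walk == NULL)	/* The walk is complete. */
			break;

		/* ... examine or reconcile walk->page here ... */
	}

err:	/* On error, release any page still referenced. */
	WT_TRET(__wt_page_release(session, walk, flags));
	return (ret);
}
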
Example #4
/*
 * __wt_block_checkpoint_load --
 *	Load a checkpoint.
 */
int
__wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block,
    const uint8_t *addr, size_t addr_size,
    uint8_t *root_addr, size_t *root_addr_sizep, bool checkpoint)
{
	WT_BLOCK_CKPT *ci, _ci;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	uint8_t *endp;

	ci = NULL;

	/*
	 * Sometimes we don't find a root page (we weren't given a checkpoint,
	 * or the checkpoint was empty).  In that case we return an empty root
	 * address; set that up now.
	 */
	*root_addr_sizep = 0;

	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
		if (addr != NULL) {
			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
			WT_ERR(__ckpt_string(session, block, addr, tmp));
		}
		WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
		    "%s: load-checkpoint: %s", block->name,
		    addr == NULL ? "[Empty]" : (const char *)tmp->data));
	}

	/*
	 * There's a single checkpoint in the file that can be written; all
	 * of the others are read-only.  We use the same initialization calls
	 * for read-only checkpoints, but the information doesn't persist.
	 */
	if (checkpoint) {
		ci = &_ci;
		WT_ERR(__wt_block_ckpt_init(session, ci, "checkpoint"));
	} else {
		/*
		 * We depend on the btree level for locking: things will go
		 * bad fast should we open the live system in two handles, or
		 * if we create, salvage, truncate or verify the live/running
		 * file, for that matter.
		 */
		ci = &block->live;
		WT_ERR(__wt_block_ckpt_init(session, ci, "live"));
	}

	/*
	 * If the checkpoint has an on-disk root page, load it.  Otherwise, size
	 * the file past the description information.
	 */
	if (addr == NULL || addr_size == 0)
		ci->file_size = block->allocsize;
	else {
		/* Crack the checkpoint cookie. */
		WT_ERR(__wt_block_buffer_to_ckpt(session, block, addr, ci));

		/* Verify sets up next. */
		if (block->verify)
			WT_ERR(__wt_verify_ckpt_load(session, block, ci));

		/* Read any root page. */
		if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) {
			endp = root_addr;
			WT_ERR(__wt_block_addr_to_buffer(block, &endp,
			    ci->root_offset, ci->root_size, ci->root_cksum));
			*root_addr_sizep = WT_PTRDIFF(endp, root_addr);
		}

		/*
		 * Rolling a checkpoint forward requires the avail list, the
		 * blocks from which we can allocate.
		 */
		if (!checkpoint)
			WT_ERR(__wt_block_extlist_read_avail(
			    session, block, &ci->avail, ci->file_size));
	}

	/*
	 * If the checkpoint can be written, anything written after the
	 * checkpoint is no longer interesting; truncate the file.  Don't
	 * bother checking the avail list for a block at the end of the
	 * file: that was done when the checkpoint was first written
	 * (re-writing the checkpoint might possibly make it relevant here,
	 * but it's unlikely enough that we don't bother).
	 */
	if (!checkpoint) {
		/*
		 * The truncate might fail if there's a file mapping (if
		 * there's an open checkpoint on the file); that's OK.
		 */
		WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
		    "truncate file to %" PRIuMAX, (uintmax_t)ci->file_size));
		WT_ERR_BUSY_OK(
		    __wt_block_truncate(session, block->fh, ci->file_size));
	}

	if (0) {
err:		/*
		 * Don't call checkpoint-unload: unload does real work including
		 * file truncation.  If we fail early enough that the checkpoint
		 * information isn't correct, bad things would happen.  The only
		 * allocated memory was in the service of verify; clean that up.
		 */
		if (block->verify)
			WT_TRET(__wt_verify_ckpt_unload(session, block));
	}

	/* Checkpoints don't need the original information, discard it. */
	if (checkpoint && ci != NULL)
		__wt_block_ckpt_destroy(session, ci);

	__wt_scr_free(session, &tmp);
	return (ret);
}
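
From the caller's side, the contract is: pass a buffer large enough for any address cookie and use the returned size to tell an empty checkpoint from one with a root page to read. A minimal usage sketch follows, assuming the WT_BTREE_MAX_ADDR_COOKIE bound on cookie size; the surrounding session, block and addr variables are taken as given, and the root-page read itself is elided.

	/* Sketch only: load the live system (checkpoint == false). */
	uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE];
	size_t root_addr_size;

	WT_RET(__wt_block_checkpoint_load(session, block,
	    addr, addr_size, root_addr, &root_addr_size, false));

	if (root_addr_size == 0) {
		/* The checkpoint was empty: there is no root page. */
	} else {
		/*
		 * Crack root_addr and read the root page through the block
		 * manager's read path.
		 */
	}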