Example No. 1
/*
 * __wt_cond_signal --
 *	Signal a waiting thread.
 */
void
__wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
{
	__wt_verbose(session, WT_VERB_MUTEX, "signal %s", cond->name);

	/*
	 * Our callers often set flags to cause a thread to exit. Add a barrier
	 * to ensure exit flags are seen by the sleeping threads, otherwise we
	 * can wake up a thread, it immediately goes back to sleep, and we'll
	 * hang. Use a full barrier (we may not write before waiting on thread
	 * join).
	 */
	WT_FULL_BARRIER();

	/*
	 * Fast path if we are in (or can enter) a state where the next waiter
	 * will return immediately as already signaled.
	 */
	if (cond->waiters == -1 ||
	    (cond->waiters == 0 && __wt_atomic_casi32(&cond->waiters, 0, -1)))
		return;

	EnterCriticalSection(&cond->mtx);
	WakeAllConditionVariable(&cond->cond);
	LeaveCriticalSection(&cond->mtx);
}
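
The fast path above relies on a small waiter-count protocol: -1 means "already signaled", 0 means idle, and positive values count sleepers. Below is a minimal, self-contained sketch of that handshake using C11 atomics; the names (toy_cond, toy_signal, toy_wait_fast_path) are hypothetical and the slow path is elided, so this shows only the shape of the idea, not WiredTiger's actual wait-side code.

#include <stdatomic.h>
#include <stdbool.h>

struct toy_cond {
	atomic_int waiters;	/* -1: pre-signaled, 0: idle, >0: sleepers */
};

static void
toy_signal(struct toy_cond *cond)
{
	int zero = 0;

	/* Publish any exit flags before waking anyone (full barrier). */
	atomic_thread_fence(memory_order_seq_cst);

	/* Fast path: leave the condition in the "already signaled" state. */
	if (atomic_load(&cond->waiters) == -1 ||
	    atomic_compare_exchange_strong(&cond->waiters, &zero, -1))
		return;

	/* Slow path: take the mutex and wake the sleepers (elided). */
}

static bool
toy_wait_fast_path(struct toy_cond *cond)
{
	int expect = -1;

	/* Consume a pending signal without blocking, if one exists. */
	return (atomic_compare_exchange_strong(&cond->waiters, &expect, 0));
}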
Example No. 2
/*
 * __wt_yield --
 *	Yield the thread of control.
 */
void
__wt_yield(void)
{
	/*
	 * Yielding the processor isn't documented as a memory barrier, and it's
	 * a reasonable expectation to have. There's no reason not to include
	 * an explicit barrier since we're giving up the CPU, and it ensures
	 * callers aren't ever surprised.
	 */
	WT_FULL_BARRIER();

	SwitchToThread();
}
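
For comparison, a POSIX-flavored analogue of the same reasoning might pair sched_yield() with an explicit fence; this is a hedged sketch, not WiredTiger's POSIX implementation.

#include <sched.h>
#include <stdatomic.h>

static void
yield_with_barrier(void)
{
	/* Yielding isn't documented as a barrier, so issue one explicitly. */
	atomic_thread_fence(memory_order_seq_cst);

	(void)sched_yield();
}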
Example No. 3
/*
 * __wt_las_set_written --
 *	Flag that the lookaside table has been written.
 */
void
__wt_las_set_written(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;

	conn = S2C(session);
	if (!conn->las_written) {
		conn->las_written = true;

		/*
		 * Push the flag: unnecessary, but from now on page reads must
		 * deal with lookaside table records, and we only do the write
		 * once.
		 */
		WT_FULL_BARRIER();
	}
}
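
The same write-once publish pattern can be expressed with C11 atomics. The sketch below uses hypothetical names (table_written, set_written, was_written) and a release/acquire pairing in place of the full barrier, purely as an illustration of the idiom.

#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool table_written;

static void
set_written(void)
{
	if (!atomic_load_explicit(&table_written, memory_order_relaxed))
		/* Release: make earlier table writes visible before the flag. */
		atomic_store_explicit(
		    &table_written, true, memory_order_release);
}

static bool
was_written(void)
{
	/* Acquire pairs with the release store above. */
	return (atomic_load_explicit(&table_written, memory_order_acquire));
}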
Example No. 4
/*
 * __lsm_tree_start_worker --
 *	Start the worker thread for an LSM tree.
 */
static int
__lsm_tree_start_worker(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_CONNECTION *wt_conn;
	WT_LSM_WORKER_ARGS *wargs;
	WT_SESSION *wt_session;
	WT_SESSION_IMPL *s;
	uint32_t i;

	wt_conn = &S2C(session)->iface;

	WT_RET(wt_conn->open_session(wt_conn, NULL, NULL, &wt_session));
	lsm_tree->ckpt_session = (WT_SESSION_IMPL *)wt_session;
	F_SET(lsm_tree->ckpt_session, WT_SESSION_INTERNAL);

	F_SET(lsm_tree, WT_LSM_TREE_WORKING);
	/* The new thread will rely on the WORKING value being visible. */
	WT_FULL_BARRIER();
	if (F_ISSET(S2C(session), WT_CONN_LSM_MERGE))
		for (i = 0; i < lsm_tree->merge_threads; i++) {
			WT_RET(wt_conn->open_session(
			    wt_conn, NULL, NULL, &wt_session));
			s = (WT_SESSION_IMPL *)wt_session;
			F_SET(s, WT_SESSION_INTERNAL);
			lsm_tree->worker_sessions[i] = s;

			WT_RET(__wt_calloc_def(session, 1, &wargs));
			wargs->lsm_tree = lsm_tree;
			wargs->id = i;
			WT_RET(__wt_thread_create(session,
			    &lsm_tree->worker_tids[i],
			    __wt_lsm_merge_worker, wargs));
		}
	if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_NEWEST)) {
		WT_RET(wt_conn->open_session(wt_conn, NULL, NULL, &wt_session));
		lsm_tree->bloom_session = (WT_SESSION_IMPL *)wt_session;
		F_SET(lsm_tree->bloom_session, WT_SESSION_INTERNAL);

		WT_RET(__wt_thread_create(session,
		    &lsm_tree->bloom_tid, __wt_lsm_bloom_worker, lsm_tree));
	}
	WT_RET(__wt_thread_create(session,
	    &lsm_tree->ckpt_tid, __wt_lsm_checkpoint_worker, lsm_tree));

	return (0);
}
Example No. 5
/*
 * __lsm_tree_start_worker --
 *	Start the worker thread for an LSM tree.
 */
static int
__lsm_tree_start_worker(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_CONNECTION *wt_conn;
	WT_LSM_WORKER_ARGS *wargs;
	WT_SESSION *wt_session;
	WT_SESSION_IMPL *s;
	uint32_t i;

	wt_conn = &S2C(session)->iface;

	/*
	 * All the LSM worker threads do their operations on read-only files.
	 * Use read-uncommitted isolation to avoid keeping updates in cache
	 * unnecessarily.
	 */
	WT_RET(wt_conn->open_session(
	    wt_conn, NULL, "isolation=read-uncommitted", &wt_session));
	lsm_tree->ckpt_session = (WT_SESSION_IMPL *)wt_session;
	F_SET(lsm_tree->ckpt_session, WT_SESSION_INTERNAL);

	F_SET(lsm_tree, WT_LSM_TREE_WORKING);
	/* The new thread will rely on the WORKING value being visible. */
	WT_FULL_BARRIER();
	if (F_ISSET(S2C(session), WT_CONN_LSM_MERGE))
		for (i = 0; i < lsm_tree->merge_threads; i++) {
			WT_RET(wt_conn->open_session(
			    wt_conn, NULL, "isolation=read-uncommitted",
			    &wt_session));
			s = (WT_SESSION_IMPL *)wt_session;
			F_SET(s, WT_SESSION_INTERNAL);
			lsm_tree->worker_sessions[i] = s;

			WT_RET(__wt_calloc_def(session, 1, &wargs));
			wargs->lsm_tree = lsm_tree;
			wargs->id = i;
			WT_RET(__wt_thread_create(session,
			    &lsm_tree->worker_tids[i],
			    __wt_lsm_merge_worker, wargs));
		}
	WT_RET(__wt_thread_create(session,
	    &lsm_tree->ckpt_tid, __wt_lsm_checkpoint_worker, lsm_tree));

	return (0);
}
Example No. 6
/*
 * __wt_thread_create --
 *	Create a new thread of control.
 */
int
__wt_thread_create(WT_SESSION_IMPL *session,
    wt_thread_t *tidret, WT_THREAD_CALLBACK(*func)(void *), void *arg)
{
	/*
	 * Creating a thread isn't a memory barrier, but WiredTiger commonly
	 * sets flags and/or state and then expects worker threads to start.
	 * Include a barrier to ensure safety in those cases.
	 */
	WT_FULL_BARRIER();

	/* Spawn a new thread of control. */
	tidret->id = (HANDLE)_beginthreadex(NULL, 0, func, arg, 0, NULL);
	if (tidret->id != 0) {
		tidret->created = true;
		return (0);
	}

	WT_RET_MSG(session, __wt_errno(), "thread create: _beginthreadex");
}
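
A POSIX analogue of the same create-with-barrier pattern, assuming pthreads, might look like the sketch below; the fence publishes any flags or state the creating thread set so the worker sees them when it starts. The names are illustrative, not WiredTiger's POSIX port.

#include <pthread.h>
#include <stdatomic.h>

static int
create_with_barrier(pthread_t *tid, void *(*func)(void *), void *arg)
{
	/* Publish flags/state the new thread will read at startup. */
	atomic_thread_fence(memory_order_seq_cst);

	return (pthread_create(tid, NULL, func, arg));
}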
Example No. 7
/*
 * __wt_thread_join --
 *	Wait for a thread of control to exit.
 */
int
__wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t *tid)
{
	DWORD windows_error;

	/* Only attempt to join if the thread was created successfully. */
	if (!tid->created)
		return (0);
	tid->created = false;

	/*
	 * Joining a thread isn't a memory barrier, but WiredTiger commonly
	 * sets flags and/or state and then expects worker threads to halt.
	 * Include a barrier to ensure safety in those cases.
	 */
	WT_FULL_BARRIER();

	if ((windows_error =
	    WaitForSingleObject(tid->id, INFINITE)) != WAIT_OBJECT_0) {
		if (windows_error == WAIT_FAILED)
			windows_error = __wt_getlasterror();
		__wt_errx(session, "thread join: WaitForSingleObject: %s",
		    __wt_formatmessage(session, windows_error));

		/* If we fail to wait, we will leak handles, do not continue. */
		return (WT_PANIC);
	}

	if (CloseHandle(tid->id) == 0) {
		windows_error = __wt_getlasterror();
		__wt_errx(session, "thread join: CloseHandle: %s",
		    __wt_formatmessage(session, windows_error));
		return (__wt_map_windows_error(windows_error));
	}

	return (0);
}
Example No. 8
/*
 * __wt_spin_lock_unregister_lock --
 *	Remove a lock from the connection's list.
 */
void
__wt_spin_lock_unregister_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
{
	WT_CONNECTION_IMPL *conn;
	u_int i;

	conn = S2C(session);

	for (i = 0; i < WT_SPINLOCK_MAX; i++)
		if (conn->spinlock_list[i] == t)
			conn->spinlock_list[i] = NULL;

	/*
	 * XXX
	 * The statistics thread reads through this array, there's a possible
	 * race: if that thread reads the pointer then goes to sleep, then we
	 * free the spinlock, then the statistics thread wakes up, it can read
	 * free'd memory.
	 *
	 * This is performance debugging code, so we're not fixing the race for
	 * now; just minimize the window.
	 */
	WT_FULL_BARRIER();
}
Example No. 9
/*
 * __sync_file --
 *	Flush pages for a specific file.
 */
static int
__sync_file(WT_SESSION_IMPL *session, int syncop)
{
	struct timespec end, start;
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	WT_REF *walk;
	WT_TXN *txn;
	uint64_t internal_bytes, leaf_bytes;
	uint64_t internal_pages, leaf_pages;
	uint32_t flags;
	bool evict_reset;

	btree = S2BT(session);

	flags = WT_READ_CACHE | WT_READ_NO_GEN;
	walk = NULL;
	txn = &session->txn;

	internal_bytes = leaf_bytes = 0;
	internal_pages = leaf_pages = 0;
	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
		WT_RET(__wt_epoch(session, &start));

	switch (syncop) {
	case WT_SYNC_WRITE_LEAVES:
		/*
		 * Write all immediately available, dirty in-cache leaf pages.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock, serialize so multiple threads don't walk the tree at
		 * the same time.
		 */
		if (!btree->modified)
			return (0);
		__wt_spin_lock(session, &btree->flush_lock);
		if (!btree->modified) {
			__wt_spin_unlock(session, &btree->flush_lock);
			return (0);
		}

		flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL;
		for (walk = NULL;;) {
			WT_ERR(__wt_tree_walk(session, &walk, NULL, flags));
			if (walk == NULL)
				break;

			/*
			 * Write dirty pages if nobody beat us to it.  Don't
			 * try to write the hottest pages: checkpoint will have
			 * to visit them anyway.
			 */
			page = walk->page;
			if (__wt_page_is_modified(page) &&
			    __wt_txn_visible_all(
			    session, page->modify->update_txn)) {
				if (txn->isolation == WT_ISO_READ_COMMITTED)
					__wt_txn_get_snapshot(session);
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
				WT_ERR(__wt_reconcile(session, walk, NULL, 0));
			}
		}
		break;
	case WT_SYNC_CHECKPOINT:
		/*
		 * We cannot check the tree modified flag in the case of a
		 * checkpoint, the checkpoint code has already cleared it.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock, serialize so multiple threads don't walk the tree at
		 * the same time.  We're holding the schema lock, but need the
		 * lower-level lock as well.
		 */
		__wt_spin_lock(session, &btree->flush_lock);

		/*
		 * When internal pages are being reconciled by checkpoint their
		 * child pages cannot disappear from underneath them or be split
		 * into them, nor can underlying blocks be freed until the block
		 * lists for the checkpoint are stable.  Set the checkpointing
		 * flag to block eviction of dirty pages until the checkpoint's
		 * internal page pass is complete, then wait for any existing
		 * eviction to complete.
		 */
		btree->checkpointing = 1;
		WT_FULL_BARRIER();

		WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset));
		if (evict_reset)
			__wt_evict_file_exclusive_off(session);

		/* Write all dirty in-cache pages. */
		flags |= WT_READ_NO_EVICT;
		for (walk = NULL;;) {
			/*
			 * If we have a page, and it was ever modified, track
			 * the highest transaction ID in the tree.  We do this
			 * here because we want the value after reconciling
			 * dirty pages.
			 */
			if (walk != NULL && walk->page != NULL &&
			    (mod = walk->page->modify) != NULL &&
			    WT_TXNID_LT(btree->rec_max_txn, mod->rec_max_txn))
				btree->rec_max_txn = mod->rec_max_txn;

			WT_ERR(__wt_tree_walk(session, &walk, NULL, flags));
			if (walk == NULL)
				break;

			page = walk->page;
			mod = page->modify;

			/* Skip clean pages. */
			if (!__wt_page_is_modified(page))
				continue;

			/*
			 * Write dirty pages, unless we can be sure they only
			 * became dirty after the checkpoint started.
			 *
			 * We can skip dirty pages if:
			 * (1) they are leaf pages;
			 * (2) there is a snapshot transaction active (which
			 *     is the case in ordinary application checkpoints
			 *     but not all internal cases); and
			 * (3) the first dirty update on the page is
			 *     sufficiently recent that the checkpoint
			 *     transaction would skip them.
			 *
			 * Mark the tree dirty: the checkpoint marked it clean
			 * and we can't skip future checkpoints until this page
			 * is written.
			 */
			if (!WT_PAGE_IS_INTERNAL(page) &&
			    F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) &&
			    WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn) &&
			    mod->rec_result != WT_PM_REC_REWRITE) {
				__wt_page_modify_set(session, page);
				continue;
			}

			if (WT_PAGE_IS_INTERNAL(page)) {
				internal_bytes += page->memory_footprint;
				++internal_pages;
			} else {
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
			}
			WT_ERR(__wt_reconcile(session, walk, NULL, 0));
		}
		break;
	}

	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
		WT_ERR(__wt_epoch(session, &end));
		WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
		    "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64
		    " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64
		    " bytes, %" PRIu64 " pages of internal\n\t"
		    "Took: %" PRIu64 "ms",
		    syncop == WT_SYNC_WRITE_LEAVES ?
		    "WRITE_LEAVES" : "CHECKPOINT",
		    leaf_bytes, leaf_pages, internal_bytes, internal_pages,
		    WT_TIMEDIFF(end, start) / WT_MILLION));
	}

err:	/* On error, clear any left-over tree walk. */
	if (walk != NULL)
		WT_TRET(__wt_page_release(session, walk, flags));

	if (txn->isolation == WT_ISO_READ_COMMITTED && session->ncursors == 0)
		__wt_txn_release_snapshot(session);

	if (btree->checkpointing) {
		/*
		 * Update the checkpoint generation for this handle so visible
		 * updates newer than the checkpoint can be evicted.
		 *
		 * This has to be published before eviction is enabled again,
		 * so that eviction knows that the checkpoint has completed.
		 */
		WT_PUBLISH(btree->checkpoint_gen,
		    S2C(session)->txn_global.checkpoint_gen);
		WT_STAT_FAST_DATA_SET(session,
		    btree_checkpoint_generation, btree->checkpoint_gen);

		/*
		 * Clear the checkpoint flag and push the change; not required,
		 * but publishing the change means stalled eviction gets moving
		 * as soon as possible.
		 */
		btree->checkpointing = 0;
		WT_FULL_BARRIER();

		/*
		 * If this tree was being skipped by the eviction server during
		 * the checkpoint, clear the wait.
		 */
		btree->evict_walk_period = 0;

		/*
		 * Wake the eviction server, in case application threads have
		 * stalled while the eviction server decided it couldn't make
		 * progress.  Without this, application threads will be stalled
		 * until the eviction server next wakes.
		 */
		WT_TRET(__wt_evict_server_wake(session));
	}

	__wt_spin_unlock(session, &btree->flush_lock);

	/*
	 * Leaves are written before a checkpoint (or as part of a file close,
	 * before checkpointing the file).  Start a flush to stable storage,
	 * but don't wait for it.
	 */
	if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES)
		WT_RET(btree->bm->sync(btree->bm, session, true));

	return (ret);
}
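
The checkpoint pass above leans on WT_PUBLISH and WT_FULL_BARRIER to order the checkpointing flag and generation updates against eviction. The sketch below shows the general shape of that publish idiom with GCC-style builtins; the macro names here are invented and the real definitions live in the WiredTiger headers.

/* Order all prior writes, then store the value other threads poll. */
#define	SKETCH_WRITE_BARRIER()	__atomic_thread_fence(__ATOMIC_RELEASE)
#define	SKETCH_FULL_BARRIER()	__atomic_thread_fence(__ATOMIC_SEQ_CST)

#define	SKETCH_PUBLISH(v, val) do {					\
	SKETCH_WRITE_BARRIER();	/* prior writes become visible first */	\
	(v) = (val);							\
} while (0)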
Example No. 10
/*
 * __sync_file --
 *	Flush pages for a specific file.
 */
static int
__sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
{
	struct timespec end, start;
	WT_BTREE *btree;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	WT_REF *walk;
	WT_TXN *txn;
	uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages;
	uint64_t oldest_id, saved_snap_min;
	uint32_t flags;

	conn = S2C(session);
	btree = S2BT(session);
	walk = NULL;
	txn = &session->txn;
	saved_snap_min = WT_SESSION_TXN_STATE(session)->snap_min;
	flags = WT_READ_CACHE | WT_READ_NO_GEN;

	internal_bytes = leaf_bytes = 0;
	internal_pages = leaf_pages = 0;
	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
		WT_RET(__wt_epoch(session, &start));

	switch (syncop) {
	case WT_SYNC_WRITE_LEAVES:
		/*
		 * Write all immediately available, dirty in-cache leaf pages.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock, serialize so multiple threads don't walk the tree at
		 * the same time.
		 */
		if (!btree->modified)
			return (0);
		__wt_spin_lock(session, &btree->flush_lock);
		if (!btree->modified) {
			__wt_spin_unlock(session, &btree->flush_lock);
			return (0);
		}

		/*
		 * Save the oldest transaction ID we need to keep around.
		 * Otherwise, in a busy system, we could be updating pages so
		 * fast that write leaves never catches up.  We deliberately
		 * have no transaction running at this point that would keep
		 * the oldest ID from moving forwards as we walk the tree.
		 */
		oldest_id = __wt_txn_oldest_id(session);

		flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL;
		for (walk = NULL;;) {
			WT_ERR(__wt_tree_walk(session, &walk, flags));
			if (walk == NULL)
				break;

			/*
			 * Write dirty pages if nobody beat us to it.  Don't
			 * try to write hot pages (defined as pages that have
			 * been updated since the write phase leaves started):
			 * checkpoint will have to visit them anyway.
			 */
			page = walk->page;
			if (__wt_page_is_modified(page) &&
			    WT_TXNID_LT(page->modify->update_txn, oldest_id)) {
				if (txn->isolation == WT_ISO_READ_COMMITTED)
					__wt_txn_get_snapshot(session);
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
				WT_ERR(__wt_reconcile(session, walk, NULL, 0));
			}
		}
		break;
	case WT_SYNC_CHECKPOINT:
		/*
		 * If we are flushing a file at read-committed isolation, which
		 * is of particular interest for flushing the metadata to make
		 * schema-changing operation durable, get a transactional
		 * snapshot now.
		 *
		 * All changes committed up to this point should be included.
		 * We don't update the snapshot in between pages because the
		 * metadata shouldn't be that big.
		 */
		if (txn->isolation == WT_ISO_READ_COMMITTED)
			__wt_txn_get_snapshot(session);

		/*
		 * We cannot check the tree modified flag in the case of a
		 * checkpoint, the checkpoint code has already cleared it.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock, serialize so multiple threads don't walk the tree at
		 * the same time.  We're holding the schema lock, but need the
		 * lower-level lock as well.
		 */
		__wt_spin_lock(session, &btree->flush_lock);

		/*
		 * In the final checkpoint pass, child pages cannot be evicted
		 * from underneath internal pages nor can underlying blocks be
		 * freed until the checkpoint's block lists are stable. Also,
		 * we cannot split child pages into parents unless we know the
		 * final pass will write a consistent view of that namespace.
		 * Set the checkpointing flag to block such actions and wait for
		 * any problematic eviction or page splits to complete.
		 */
		WT_PUBLISH(btree->checkpointing, WT_CKPT_PREPARE);

		WT_ERR(__wt_evict_file_exclusive_on(session));
		__wt_evict_file_exclusive_off(session);

		WT_PUBLISH(btree->checkpointing, WT_CKPT_RUNNING);

		/* Write all dirty in-cache pages. */
		flags |= WT_READ_NO_EVICT;
		for (walk = NULL;;) {
			WT_ERR(__wt_tree_walk(session, &walk, flags));
			if (walk == NULL)
				break;

			/* Skip clean pages. */
			if (!__wt_page_is_modified(walk->page))
				continue;

			/*
			 * Take a local reference to the page modify structure
			 * now that we know the page is dirty. It needs to be
			 * done in this order otherwise the page modify
			 * structure could have been created between taking the
			 * reference and checking modified.
			 */
			page = walk->page;
			mod = page->modify;

			/*
			 * Write dirty pages, unless we can be sure they only
			 * became dirty after the checkpoint started.
			 *
			 * We can skip dirty pages if:
			 * (1) they are leaf pages;
			 * (2) there is a snapshot transaction active (which
			 *     is the case in ordinary application checkpoints
			 *     but not all internal cases); and
			 * (3) the first dirty update on the page is
			 *     sufficiently recent that the checkpoint
			 *     transaction would skip them.
			 *
			 * Mark the tree dirty: the checkpoint marked it clean
			 * and we can't skip future checkpoints until this page
			 * is written.
			 */
			if (!WT_PAGE_IS_INTERNAL(page) &&
			    F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) &&
			    WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn)) {
				__wt_page_modify_set(session, page);
				continue;
			}

			if (WT_PAGE_IS_INTERNAL(page)) {
				internal_bytes += page->memory_footprint;
				++internal_pages;
			} else {
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
			}
			WT_ERR(__wt_reconcile(session, walk, NULL, 0));
		}
		break;
	case WT_SYNC_CLOSE:
	case WT_SYNC_DISCARD:
	WT_ILLEGAL_VALUE_ERR(session);
	}

	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
		WT_ERR(__wt_epoch(session, &end));
		WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
		    "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64
		    " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64
		    " bytes, %" PRIu64 " pages of internal\n\t"
		    "Took: %" PRIu64 "ms",
		    syncop == WT_SYNC_WRITE_LEAVES ?
		    "WRITE_LEAVES" : "CHECKPOINT",
		    leaf_bytes, leaf_pages, internal_bytes, internal_pages,
		    WT_TIMEDIFF_MS(end, start)));
	}

err:	/* On error, clear any left-over tree walk. */
	if (walk != NULL)
		WT_TRET(__wt_page_release(session, walk, flags));

	/*
	 * If we got a snapshot in order to write pages, and there was no
	 * snapshot active when we started, release it.
	 */
	if (txn->isolation == WT_ISO_READ_COMMITTED &&
	    saved_snap_min == WT_TXN_NONE)
		__wt_txn_release_snapshot(session);

	if (btree->checkpointing != WT_CKPT_OFF) {
		/*
		 * Update the checkpoint generation for this handle so visible
		 * updates newer than the checkpoint can be evicted.
		 *
		 * This has to be published before eviction is enabled again,
		 * so that eviction knows that the checkpoint has completed.
		 */
		WT_PUBLISH(btree->checkpoint_gen,
		    conn->txn_global.checkpoint_gen);
		WT_STAT_FAST_DATA_SET(session,
		    btree_checkpoint_generation, btree->checkpoint_gen);

		/*
		 * Clear the checkpoint flag and push the change; not required,
		 * but publishing the change means stalled eviction gets moving
		 * as soon as possible.
		 */
		btree->checkpointing = WT_CKPT_OFF;
		WT_FULL_BARRIER();

		/*
		 * If this tree was being skipped by the eviction server during
		 * the checkpoint, clear the wait.
		 */
		btree->evict_walk_period = 0;

		/*
		 * Wake the eviction server, in case application threads have
		 * stalled while the eviction server decided it couldn't make
		 * progress.  Without this, application threads will be stalled
		 * until the eviction server next wakes.
		 */
		WT_TRET(__wt_evict_server_wake(session));
	}

	__wt_spin_unlock(session, &btree->flush_lock);

	/*
	 * Leaves are written before a checkpoint (or as part of a file close,
	 * before checkpointing the file).  Start a flush to stable storage,
	 * but don't wait for it.
	 */
	if (ret == 0 &&
	    syncop == WT_SYNC_WRITE_LEAVES && F_ISSET(conn, WT_CONN_CKPT_SYNC))
		WT_RET(btree->bm->sync(btree->bm, session, true));

	return (ret);
}
Example No. 11
/*
 * __wt_hazard_set --
 *	Set a hazard pointer.
 */
int
__wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, bool *busyp
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
	WT_BTREE *btree;
	WT_CONNECTION_IMPL *conn;
	WT_HAZARD *hp;
	int restarts = 0;

	btree = S2BT(session);
	conn = S2C(session);
	*busyp = false;

	/* If a file can never be evicted, hazard pointers aren't required. */
	if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
		return (0);

	/*
	 * Do the dance:
	 *
	 * The memory location which makes a page "real" is the WT_REF's state
	 * of WT_REF_MEM, which can be set to WT_REF_LOCKED at any time by the
	 * page eviction server.
	 *
	 * Add the WT_REF reference to the session's hazard list and flush the
	 * write, then see if the page's state is still valid.  If so, we can
	 * use the page because the page eviction server will see our hazard
	 * pointer before it discards the page (the eviction server sets the
	 * state to WT_REF_LOCKED, then flushes memory and checks the hazard
	 * pointers).
	 *
	 * For sessions with many active hazard pointers, skip most of the
	 * active slots: there may be a free slot in there, but checking is
	 * expensive.  Most hazard pointers are released quickly: optimize
	 * for that case.
	 */
	for (hp = session->hazard + session->nhazard;; ++hp) {
		/*
		 * If we get to the end of the array, either:
		 * 1. If we know there are free slots somewhere, and this is
		 *    the first time through, continue the search from the
		 *    start.  Don't actually continue the loop because that
		 *    will skip the first slot.
		 * 2. If we have searched all the way through and we have
		 *    allocated the maximum number of slots, give up.
		 * 3. Allocate another increment of slots, up to the maximum.
		 *    The slot we are on should now be available.
		 */
		if (hp >= session->hazard + session->hazard_size) {
			if (session->nhazard < session->hazard_size &&
			    restarts++ == 0)
				hp = session->hazard;
			else if (session->hazard_size >= conn->hazard_max)
				break;
			else
				WT_PUBLISH(session->hazard_size, WT_MIN(
				    session->hazard_size + WT_HAZARD_INCR,
				    conn->hazard_max));
		}

		if (hp->page != NULL)
			continue;

		hp->page = ref->page;
#ifdef HAVE_DIAGNOSTIC
		hp->file = file;
		hp->line = line;
#endif
		/* Publish the hazard pointer before reading page's state. */
		WT_FULL_BARRIER();

		/*
		 * Check if the page state is still valid, where valid means a
		 * state of WT_REF_MEM and the pointer is unchanged.  (The
		 * pointer can change, it means the page was evicted between
		 * the time we set our hazard pointer and the publication.  It
		 * would theoretically be possible for the page to be evicted
		 * and a different page read into the same memory, so the
		 * pointer hasn't changed but the contents have.  That's OK, we
		 * found this page using the tree's key space, whatever page we
		 * find here is the page for us to use.)
		 */
		if (ref->page == hp->page && ref->state == WT_REF_MEM) {
			++session->nhazard;
			return (0);
		}

		/*
		 * The page isn't available, it's being considered for eviction
		 * (or being evicted, for all we know).  If the eviction server
		 * sees our hazard pointer before evicting the page, it will
		 * return the page to use, no harm done, if it doesn't, it will
		 * go ahead and complete the eviction.
		 *
		 * We don't bother publishing this update: the worst case is we
		 * prevent some random page from being evicted.
		 */
		hp->page = NULL;
		*busyp = true;
		return (0);
	}

	__wt_errx(session,
	    "session %p: hazard pointer table full", (void *)session);
#ifdef HAVE_DIAGNOSTIC
	__hazard_dump(session);
#endif

	return (ENOMEM);
}
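
The "dance" described in the comment is the classic hazard-pointer handshake: the reader publishes its pointer and re-checks the page state, while the evictor locks the state and scans the hazard slots before freeing anything. A compact, generic sketch with C11 atomics and hypothetical names (reader_pin, evict_try) follows; it captures the protocol only, not the WiredTiger data structures.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct target {
	_Atomic(void *) page;	/* page pointer, NULL once evicted */
	atomic_int state;	/* 0 = in memory, 1 = locked for eviction */
};

/* Reader: publish the hazard pointer, then re-check the target's state. */
static bool
reader_pin(struct target *t, _Atomic(void *) *slot)
{
	void *p = atomic_load(&t->page);

	atomic_store(slot, p);
	/* Publish the hazard before re-checking (full barrier). */
	atomic_thread_fence(memory_order_seq_cst);

	if (atomic_load(&t->page) == p && atomic_load(&t->state) == 0)
		return (true);		/* safe to use the page */

	atomic_store(slot, NULL);	/* lost the race, back off */
	return (false);
}

/* Evictor: lock the target, then scan hazard slots before freeing. */
static bool
evict_try(struct target *t, _Atomic(void *) *slots, size_t nslots)
{
	void *p = atomic_load(&t->page);
	size_t i;

	atomic_store(&t->state, 1);
	/* Lock the state before scanning for hazards (full barrier). */
	atomic_thread_fence(memory_order_seq_cst);

	for (i = 0; i < nslots; ++i)
		if (atomic_load(&slots[i]) == p) {
			atomic_store(&t->state, 0);	/* hazard found */
			return (false);
		}

	atomic_store(&t->page, NULL);	/* no hazards: safe to evict/free */
	return (true);
}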
Example No. 12
/*
 * __wt_statlog_dump_spinlock --
 *	Log the spin-lock statistics.
 */
int
__wt_statlog_dump_spinlock(WT_CONNECTION_IMPL *conn, const char *tag)
{
	WT_SPINLOCK *spin;
	WT_CONNECTION_STATS_SPINLOCK *p, *t;
	uint64_t block_manager, btree_page, ignore;
	u_int i, j;

	/*
	 * Ignore rare acquisition of a spinlock using a base value of 10 per
	 * second so we don't create graphs we don't care about.
	 */
	ignore = (uint64_t)(conn->stat_usecs / 1000000) * 10;

	/* Output the number of times each spinlock was acquired. */
	block_manager = btree_page = 0;
	for (i = 0; i < WT_ELEMENTS(conn->spinlock_list); ++i) {
		if ((spin = conn->spinlock_list[i]) == NULL)
			continue;

		/*
		 * There are two sets of spinlocks we aggregate, the btree page
		 * locks and the block manager per-file locks.  The reason is
		 * the block manager locks grow with the number of files open
		 * (and LSM and bloom filters can open a lot of files), and
		 * there are 16 btree page locks and splitting them out has not
		 * historically been that informative.
		 */
		if (strcmp(spin->name, "block manager") == 0) {
			block_manager += spin->counter;
			if (conn->stat_clear)
				spin->counter = 0;
			continue;
		}
		if (strcmp(spin->name, "btree page") == 0) {
			btree_page += spin->counter;
			if (conn->stat_clear)
				spin->counter = 0;
			continue;
		}

		WT_RET_TEST((fprintf(conn->stat_fp,
		    "%s %" PRIu64 " %s spinlock %s: acquisitions\n",
		    conn->stat_stamp,
		    spin->counter <= ignore ? 0 : spin->counter,
		    tag, spin->name) < 0),
		    __wt_errno());
		if (conn->stat_clear)
			spin->counter = 0;
	}
	WT_RET_TEST((fprintf(conn->stat_fp,
	    "%s %" PRIu64 " %s spinlock %s: acquisitions\n",
	    conn->stat_stamp,
	    block_manager <= ignore ? 0 : block_manager,
	    tag, "block manager") < 0),
	    __wt_errno());
	WT_RET_TEST((fprintf(conn->stat_fp,
	    "%s %" PRIu64 " %s spinlock %s: acquisitions\n",
	    conn->stat_stamp,
	    btree_page <= ignore ? 0 : btree_page,
	    tag, "btree page") < 0),
	    __wt_errno());

	/*
	 * Output the number of times each location acquires its spinlock and
	 * the blocking matrix.
	 */
	for (i = 0; i < WT_ELEMENTS(conn->spinlock_block); ++i) {
		p = &conn->spinlock_block[i];
		if (p->name == NULL)
			continue;

		WT_RET_TEST((fprintf(conn->stat_fp,
		    "%s %d %s spinlock %s acquired by %s(%d)\n",
		    conn->stat_stamp,
		    p->total <= ignore ? 0 : p->total,
		    tag,
		    p->name, p->file, p->line) < 0), __wt_errno());
		if (conn->stat_clear)
			p->total = 0;

		for (j = 0; j < WT_ELEMENTS(conn->spinlock_block); ++j) {
			t = &conn->spinlock_block[j];
			if (t->name == NULL)
				continue;

			WT_RET_TEST((fprintf(conn->stat_fp,
			    "%s %d %s spinlock %s: %s(%d) blocked by %s(%d)\n",
			    conn->stat_stamp,
			    p->blocked[j] <= ignore ? 0 : p->blocked[j],
			    tag,
			    p->name, p->file, p->line,
			    t->file, t->line) < 0), __wt_errno());
			if (conn->stat_clear)
				p->blocked[j] = 0;
		}
	}

	WT_FULL_BARRIER();			/* Minimize the window. */
	return (0);
}
Example No. 13
/*
 * __wt_connection_close --
 *	Close a connection handle.
 */
int
__wt_connection_close(WT_CONNECTION_IMPL *conn)
{
	WT_CONNECTION *wt_conn;
	WT_DECL_RET;
	WT_DLH *dlh;
	WT_SESSION_IMPL *s, *session;
	u_int i;

	wt_conn = &conn->iface;
	session = conn->default_session;

	/* Shut down transactions (wait for in-flight operations to complete). */
	WT_TRET(__wt_txn_global_shutdown(session));

	/* Shut down the subsystems, ensuring workers see the state change. */
	F_SET(conn, WT_CONN_CLOSING);
	WT_FULL_BARRIER();

	/*
	 * Clear any pending async operations and shut down the async worker
	 * threads and system before closing LSM.
	 */
	WT_TRET(__wt_async_flush(session));
	WT_TRET(__wt_async_destroy(session));

	/*
	 * Shut down server threads other than the eviction server, which is
	 * needed later to close btree handles.  Some of these threads access
	 * btree handles, so take care in ordering shutdown to make sure they
	 * exit before files are closed.
	 */
	WT_TRET(__wt_lsm_manager_destroy(session));

	/*
	 * Once the async and LSM threads exit, we shouldn't be opening any
	 * more files.
	 */
	F_SET(conn, WT_CONN_CLOSING_NO_MORE_OPENS);
	WT_FULL_BARRIER();

	WT_TRET(__wt_checkpoint_server_destroy(session));
	WT_TRET(__wt_statlog_destroy(session, true));
	WT_TRET(__wt_sweep_destroy(session));

	/* The eviction server is shut down last. */
	WT_TRET(__wt_evict_destroy(session));

	/* Shut down the lookaside table, after all eviction is complete. */
	WT_TRET(__wt_las_destroy(session));

	/* Close open data handles. */
	WT_TRET(__wt_conn_dhandle_discard(session));

	/* Shut down metadata tracking. */
	WT_TRET(__wt_meta_track_destroy(session));

	/*
	 * Now that all data handles are closed, tell logging that a checkpoint
	 * has completed then shut down the log manager (only after closing
	 * data handles).  The call to destroy the log manager is outside the
	 * conditional because we allocate the log path so that printlog can
	 * run without running logging or recovery.
	 */
	if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
	    FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE))
		WT_TRET(__wt_txn_checkpoint_log(
		    session, true, WT_TXN_LOG_CKPT_STOP, NULL));
	WT_TRET(__wt_logmgr_destroy(session));

	/* Free memory for collators, compressors, data sources. */
	WT_TRET(__wt_conn_remove_collator(session));
	WT_TRET(__wt_conn_remove_compressor(session));
	WT_TRET(__wt_conn_remove_data_source(session));
	WT_TRET(__wt_conn_remove_encryptor(session));
	WT_TRET(__wt_conn_remove_extractor(session));

	/* Disconnect from shared cache - must be before cache destroy. */
	WT_TRET(__wt_conn_cache_pool_destroy(session));

	/* Discard the cache. */
	WT_TRET(__wt_cache_destroy(session));

	/* Discard transaction state. */
	__wt_txn_global_destroy(session);

	/* Close the lock file, opening up the database to other connections. */
	if (conn->lock_fh != NULL)
		WT_TRET(__wt_close(session, &conn->lock_fh));

	/* Close any file handles left open. */
	WT_TRET(__wt_close_connection_close(session));

	/*
	 * Close the internal (default) session, and switch back to the dummy
	 * session in case of any error messages from the remaining operations
	 * while destroying the connection handle.
	 */
	if (session != &conn->dummy_session) {
		WT_TRET(session->iface.close(&session->iface, NULL));
		session = conn->default_session = &conn->dummy_session;
	}

	/*
	 * The session split stash, hazard information and handle arrays aren't
	 * discarded during normal session close, they persist past the life of
	 * the session. Discard them now.
	 */
	if (!F_ISSET(conn, WT_CONN_LEAK_MEMORY))
		if ((s = conn->sessions) != NULL)
			for (i = 0; i < conn->session_size; ++s, ++i) {
				__wt_free(session, s->dhhash);
				__wt_stash_discard_all(session, s);
				__wt_free(session, s->hazard);
			}

	/* Destroy the file-system configuration. */
	if (conn->file_system != NULL && conn->file_system->terminate != NULL)
		WT_TRET(conn->file_system->terminate(
		    conn->file_system, (WT_SESSION *)session));

	/* Close extensions, first calling any unload entry point. */
	while ((dlh = TAILQ_FIRST(&conn->dlhqh)) != NULL) {
		TAILQ_REMOVE(&conn->dlhqh, dlh, q);

		if (dlh->terminate != NULL)
			WT_TRET(dlh->terminate(wt_conn));
		WT_TRET(__wt_dlclose(session, dlh));
	}

	/* Destroy the handle. */
	__wt_connection_destroy(conn);

	return (ret);
}
Example No. 14
/*
 * __wt_hazard_set --
 *	Set a hazard pointer.
 */
int
__wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, int *busyp
#ifdef HAVE_DIAGNOSTIC
                , const char *file, int line
#endif
               )
{
    WT_BTREE *btree;
    WT_HAZARD *hp;
    int restarts = 0;

    btree = S2BT(session);
    *busyp = 0;

    /* If a file can never be evicted, hazard pointers aren't required. */
    if (F_ISSET(btree, WT_BTREE_NO_HAZARD))
        return (0);

    /*
     * Do the dance:
     *
     * The memory location which makes a page "real" is the WT_REF's state
     * of WT_REF_MEM, which can be set to WT_REF_LOCKED at any time by the
     * page eviction server.
     *
     * Add the WT_REF reference to the session's hazard list and flush the
     * write, then see if the page's state is still valid.  If so, we can
     * use the page because the page eviction server will see our hazard
     * pointer before it discards the page (the eviction server sets the
     * state to WT_REF_LOCKED, then flushes memory and checks the hazard
     * pointers).
     *
     * For sessions with many active hazard pointers, skip most of the
     * active slots: there may be a free slot in there, but checking is
     * expensive.  Most hazard pointers are released quickly: optimize
     * for that case.
     */
    for (hp = session->hazard + session->nhazard;; ++hp) {
        /* Expand the number of hazard pointers if available. */
        if (hp >= session->hazard + session->hazard_size) {
            if (session->hazard_size >= S2C(session)->hazard_max)
                break;
            /* Restart the search. */
            if (session->nhazard < session->hazard_size &&
                    restarts++ == 0) {
                hp = session->hazard;
                continue;
            }
            WT_PUBLISH(session->hazard_size,
                       WT_MIN(session->hazard_size + WT_HAZARD_INCR,
                              S2C(session)->hazard_max));
        }

        if (hp->page != NULL)
            continue;

        hp->page = ref->page;
#ifdef HAVE_DIAGNOSTIC
        hp->file = file;
        hp->line = line;
#endif
        /* Publish the hazard pointer before reading page's state. */
        WT_FULL_BARRIER();

        /*
         * Check if the page state is still valid, where valid means a
         * state of WT_REF_MEM or WT_REF_EVICT_WALK and the pointer is
         * unchanged.  (The pointer can change, it means the page was
         * evicted between the time we set our hazard pointer and the
         * publication.  It would theoretically be possible for the
         * page to be evicted and a different page read into the same
         * memory, so the pointer hasn't changed but the contents have.
         * That's OK, we found this page using the tree's key space,
         * whatever page we find here is the page for us to use.)
         */
        if (ref->page == hp->page &&
                (ref->state == WT_REF_MEM ||
                 ref->state == WT_REF_EVICT_WALK)) {
            WT_VERBOSE_RET(session, hazard,
                           "session %p hazard %p: set", session, ref->page);

            ++session->nhazard;
            return (0);
        }

        /*
         * The page isn't available, it's being considered for eviction
         * (or being evicted, for all we know).  If the eviction server
         * sees our hazard pointer before evicting the page, it will
         * return the page to use, no harm done, if it doesn't, it will
         * go ahead and complete the eviction.
         *
         * We don't bother publishing this update: the worst case is we
         * prevent some random page from being evicted.
         */
        hp->page = NULL;
        *busyp = 1;
        return (0);
    }

    __wt_errx(session,
              "session %p: hazard pointer table full", session);
#ifdef HAVE_DIAGNOSTIC
    __hazard_dump(session);
#endif

    return (ENOMEM);
}
Example No. 15
/*
 * __log_file_server --
 *	The log file server thread.  This worker thread manages
 *	log file operations such as closing and syncing.
 */
static WT_THREAD_RET
__log_file_server(void *arg)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_FH *close_fh;
	WT_LOG *log;
	WT_LSN close_end_lsn, min_lsn;
	WT_SESSION_IMPL *session;
	uint32_t filenum;
	bool locked;

	session = arg;
	conn = S2C(session);
	log = conn->log;
	locked = false;
	while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
		/*
		 * If there is a log file to close, make sure any outstanding
		 * write operations have completed, then fsync and close it.
		 */
		if ((close_fh = log->log_close_fh) != NULL) {
			WT_ERR(__wt_log_extract_lognum(session, close_fh->name,
			    &filenum));
			/*
			 * We update the close file handle before updating the
			 * close LSN when changing files.  It is possible we
			 * could see mismatched settings.  If we do, yield
			 * until it is set.  This should rarely happen.
			 */
			while (log->log_close_lsn.l.file < filenum)
				__wt_yield();

			if (__wt_log_cmp(
			    &log->write_lsn, &log->log_close_lsn) >= 0) {
				/*
				 * We've copied the file handle, clear out the
				 * one in the log structure to allow it to be
				 * set again.  Copy the LSN before clearing
				 * the file handle.
				 * Use a barrier to make sure the compiler does
				 * not reorder the following two statements.
				 */
				close_end_lsn = log->log_close_lsn;
				WT_FULL_BARRIER();
				log->log_close_fh = NULL;
				/*
				 * Set the close_end_lsn to the LSN immediately
				 * after ours.  That is, the beginning of the
				 * next log file.  We need to know the LSN
				 * file number of our own close in case
				 * earlier calls are still in progress, and we
				 * are the next one to move the sync_lsn into
				 * the next file for later syncs.
				 */
				WT_ERR(__wt_fsync(session, close_fh, true));

				/*
				 * We want to have the file size reflect actual
				 * data with minimal pre-allocated zeroed space.
				 * We can't truncate the file during hot backup,
				 * or the underlying file system may not support
				 * truncate: both are OK, it's just more work
				 * during cursor traversal.
				 */
				if (!conn->hot_backup) {
					__wt_readlock(
					    session, conn->hot_backup_lock);
					if (!conn->hot_backup)
						WT_ERR_ERROR_OK(
						    __wt_ftruncate(session,
						    close_fh,
						    close_end_lsn.l.offset),
						    ENOTSUP);
					__wt_readunlock(
					    session, conn->hot_backup_lock);
				}
				WT_SET_LSN(&close_end_lsn,
				    close_end_lsn.l.file + 1, 0);
				__wt_spin_lock(session, &log->log_sync_lock);
				locked = true;
				WT_ERR(__wt_close(session, &close_fh));
				WT_ASSERT(session, __wt_log_cmp(
				    &close_end_lsn, &log->sync_lsn) >= 0);
				log->sync_lsn = close_end_lsn;
				__wt_cond_signal(session, log->log_sync_cond);
				locked = false;
				__wt_spin_unlock(session, &log->log_sync_lock);
			}
		}
		/*
		 * If a later thread asked for a background sync, do it now.
		 */
		if (__wt_log_cmp(&log->bg_sync_lsn, &log->sync_lsn) > 0) {
			/*
			 * Save the latest write LSN which is the minimum
			 * we will have written to disk.
			 */
			min_lsn = log->write_lsn;
			/*
			 * We have to wait until the LSN we asked for is
			 * written.  If it isn't signal the wrlsn thread
			 * to get it written.
			 *
			 * We also have to wait for the written LSN and the
			 * sync LSN to be in the same file so that we know we
			 * have synchronized all earlier log files.
			 */
			if (__wt_log_cmp(&log->bg_sync_lsn, &min_lsn) <= 0) {
				/*
				 * If the sync file is behind either the one
				 * wanted for a background sync or the write LSN
				 * has moved to another file continue to let
				 * this worker thread process that older file
				 * immediately.
				 */
				if ((log->sync_lsn.l.file <
				    log->bg_sync_lsn.l.file) ||
				    (log->sync_lsn.l.file < min_lsn.l.file))
					continue;
				WT_ERR(__wt_fsync(session, log->log_fh, true));
				__wt_spin_lock(session, &log->log_sync_lock);
				locked = true;
				/*
				 * The sync LSN could have advanced while we
				 * were writing to disk.
				 */
				if (__wt_log_cmp(
				    &log->sync_lsn, &min_lsn) <= 0) {
					WT_ASSERT(session,
					    min_lsn.l.file ==
					    log->sync_lsn.l.file);
					log->sync_lsn = min_lsn;
					__wt_cond_signal(
					    session, log->log_sync_cond);
				}
				locked = false;
				__wt_spin_unlock(session, &log->log_sync_lock);
			} else {
				__wt_cond_auto_signal(
				    session, conn->log_wrlsn_cond);
				/*
				 * We do not want to wait potentially a second
				 * to process this.  Yield to give the wrlsn
				 * thread a chance to run and try again in
				 * this case.
				 */
				__wt_yield();
				continue;
			}
		}
		/* Wait until the next event. */
		__wt_cond_wait(session, conn->log_file_cond, WT_MILLION / 10);
	}

	if (0) {
err:		__wt_err(session, ret, "log close server error");
	}
	if (locked)
		__wt_spin_unlock(session, &log->log_sync_lock);
	return (WT_THREAD_RET_VALUE);
}
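
The barrier between copying log_close_lsn and clearing log_close_fh is a copy-then-clear ordering: the consumer must snapshot the value before it releases the slot, or the producer could install a new handle/LSN pair and the consumer would read a mismatched one. A small generic sketch, with hypothetical names, follows.

#include <stdatomic.h>

struct slot {
	_Atomic(void *) handle;	/* set by the producer, cleared by us */
	long value;		/* value paired with the handle */
};

static void
consume_slot(struct slot *s, long *value_copy)
{
	/* Snapshot the paired value first... */
	*value_copy = s->value;

	/* ...then clear the handle so the producer can reuse the slot. */
	atomic_thread_fence(memory_order_seq_cst);
	atomic_store(&s->handle, NULL);
}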
Example No. 16
/*
 * __log_file_server --
 *	The log file server thread.  This worker thread manages
 *	log file operations such as closing and syncing.
 */
static WT_THREAD_RET
__log_file_server(void *arg)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_FH *close_fh;
	WT_LOG *log;
	WT_LSN close_end_lsn, min_lsn;
	WT_SESSION_IMPL *session;
	uint32_t filenum;
	int locked;

	session = arg;
	conn = S2C(session);
	log = conn->log;
	locked = 0;
	while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
		/*
		 * If there is a log file to close, make sure any outstanding
		 * write operations have completed, then fsync and close it.
		 */
		if ((close_fh = log->log_close_fh) != NULL) {
			WT_ERR(__wt_log_extract_lognum(session, close_fh->name,
			    &filenum));
			/*
			 * We update the close file handle before updating the
			 * close LSN when changing files.  It is possible we
			 * could see mismatched settings.  If we do, yield
			 * until it is set.  This should rarely happen.
			 */
			while (log->log_close_lsn.file < filenum)
				__wt_yield();

			if (__wt_log_cmp(
			    &log->write_lsn, &log->log_close_lsn) >= 0) {
				/*
				 * We've copied the file handle, clear out the
				 * one in the log structure to allow it to be
				 * set again.  Copy the LSN before clearing
				 * the file handle.
				 * Use a barrier to make sure the compiler does
				 * not reorder the following two statements.
				 */
				close_end_lsn = log->log_close_lsn;
				WT_FULL_BARRIER();
				log->log_close_fh = NULL;
				/*
				 * Set the close_end_lsn to the LSN immediately
				 * after ours.  That is, the beginning of the
				 * next log file.  We need to know the LSN
				 * file number of our own close in case
				 * earlier calls are still in progress, and we
				 * are the next one to move the sync_lsn into
				 * the next file for later syncs.
				 */
				close_end_lsn.file++;
				close_end_lsn.offset = 0;
				WT_ERR(__wt_fsync(session, close_fh));
				__wt_spin_lock(session, &log->log_sync_lock);
				locked = 1;
				WT_ERR(__wt_close(session, &close_fh));
				WT_ASSERT(session, __wt_log_cmp(
				    &close_end_lsn, &log->sync_lsn) >= 0);
				log->sync_lsn = close_end_lsn;
				WT_ERR(__wt_cond_signal(
				    session, log->log_sync_cond));
				locked = 0;
				__wt_spin_unlock(session, &log->log_sync_lock);
			}
		}
		/*
		 * If a later thread asked for a background sync, do it now.
		 */
		if (__wt_log_cmp(&log->bg_sync_lsn, &log->sync_lsn) > 0) {
			/*
			 * Save the latest write LSN which is the minimum
			 * we will have written to disk.
			 */
			min_lsn = log->write_lsn;
			/*
			 * We have to wait until the LSN we asked for is
			 * written.  If it isn't signal the wrlsn thread
			 * to get it written.
			 */
			if (__wt_log_cmp(&log->bg_sync_lsn, &min_lsn) <= 0) {
				WT_ERR(__wt_fsync(session, log->log_fh));
				__wt_spin_lock(session, &log->log_sync_lock);
				locked = 1;
				/*
				 * The sync LSN could have advanced while we
				 * were writing to disk.
				 */
				if (__wt_log_cmp(
				    &log->sync_lsn, &min_lsn) <= 0) {
					log->sync_lsn = min_lsn;
					WT_ERR(__wt_cond_signal(
					    session, log->log_sync_cond));
				}
				locked = 0;
				__wt_spin_unlock(session, &log->log_sync_lock);
			} else {
				WT_ERR(__wt_cond_signal(
				    session, conn->log_wrlsn_cond));
				/*
				 * We do not want to wait potentially a second
				 * to process this.  Yield to give the wrlsn
				 * thread a chance to run and try again in
				 * this case.
				 */
				__wt_yield();
				continue;
			}
		}
		/* Wait until the next event. */
		WT_ERR(__wt_cond_wait(
		    session, conn->log_file_cond, WT_MILLION));
	}

	if (0) {
err:		__wt_err(session, ret, "log close server error");
	}
	if (locked)
		__wt_spin_unlock(session, &log->log_sync_lock);
	return (WT_THREAD_RET_VALUE);
}
Example No. 17
/*
 * __wt_compact_page_skip --
 *	Return if compaction requires we read this page.
 */
int
__wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
	WT_BM *bm;
	WT_DECL_RET;
	size_t addr_size;
	u_int type;
	const uint8_t *addr;

	/*
	 * Skip deleted pages, rewriting them doesn't seem useful; in a better
	 * world we'd write the parent to delete the page.
	 */
	if (ref->state == WT_REF_DELETED) {
		*skipp = true;
		return (0);
	}

	*skipp = false;				/* Default to reading */

	/*
	 * If the page is in-memory, we want to look at it (it may have been
	 * modified and written, and the current location is the interesting
	 * one in terms of compaction, not the original location).
	 *
	 * This test could be combined with the next one, but this is a cheap
	 * test and the next one is expensive.
	 */
	if (ref->state != WT_REF_DISK)
		return (0);

	/*
	 * There's nothing to prevent the WT_REF state from changing underfoot,
	 * which can change its address. For example, the WT_REF address might
	 * reference an on-page cell, and page eviction can free that memory.
	 * Lock the WT_REF so we can look at its address.
	 */
	if (!__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED))
		return (0);

	/*
	 * The page is on disk, so there had better be an address; assert that
	 * fact, test at run-time to avoid the core dump.
	 *
	 * Internal pages must be read to walk the tree; ask the block-manager
	 * if it's useful to rewrite leaf pages, don't do the I/O if a rewrite
	 * won't help.
	 */
	__wt_ref_info(ref, &addr, &addr_size, &type);
	WT_ASSERT(session, addr != NULL);
	if (addr != NULL && type != WT_CELL_ADDR_INT) {
		bm = S2BT(session)->bm;
		ret = bm->compact_page_skip(
		    bm, session, addr, addr_size, skipp);
	}

	/*
	 * Reset the WT_REF state and push the change. The full-barrier isn't
	 * necessary, but it's better to keep pages in circulation than not.
	 */
	ref->state = WT_REF_DISK;
	WT_FULL_BARRIER();

	return (ret);
}
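
The WT_REF handling above is a lock-by-CAS pattern: atomically move the state to LOCKED, inspect the record, then restore the state and push the change. A generic sketch with C11 atomics and invented names follows; it is the shape of the pattern, not the WiredTiger API.

#include <stdatomic.h>
#include <stdbool.h>

enum { STATE_DISK, STATE_LOCKED, STATE_MEM };

static bool
with_locked_state(atomic_int *state, void (*fn)(void *), void *arg)
{
	int expect = STATE_DISK;

	/* Try to take exclusive access; give up if the state has moved on. */
	if (!atomic_compare_exchange_strong(state, &expect, STATE_LOCKED))
		return (false);

	fn(arg);			/* examine the record safely */

	/* Restore the state and make the change visible promptly. */
	atomic_store(state, STATE_DISK);
	atomic_thread_fence(memory_order_seq_cst);
	return (true);
}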