Example #1
0
/*
 * __rec_review --
 *	Get exclusive access to the page and review the page and its subtree
 *	for conditions that would block its eviction.
 *
 *	The ref and page arguments may appear to be redundant, because usually
 *	ref->page == page and page->ref == ref.  However, we need both because
 *	(a) there are cases where ref == NULL (e.g., for root page or during
 *	salvage), and (b) we can't safely look at page->ref until we have a
 *	hazard pointer.
 */
static int
__rec_review(WT_SESSION_IMPL *session,
    WT_REF *ref, WT_PAGE *page, int exclusive, int merge, int top)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE_MODIFY *mod;
	WT_PAGE *t;
	uint32_t i;

	btree = session->btree;

	/*
	 * Get exclusive access to the page if our caller doesn't have the tree
	 * locked down.
	 */
	if (!exclusive)
		WT_RET(__hazard_exclusive(session, ref, top));

	/*
	 * Recurse through the page's subtree: this happens first because we
	 * have to write pages in depth-first order, otherwise we'll dirty
	 * pages after we've written them.
	 */
	if (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT)
		WT_REF_FOREACH(page, ref, i)
			switch (ref->state) {
			case WT_REF_DISK:		/* On-disk */
			case WT_REF_DELETED:		/* On-disk, deleted */
				break;
			case WT_REF_MEM:		/* In-memory */
				WT_RET(__rec_review(session,
				    ref, ref->page, exclusive, merge, 0));
				break;
			case WT_REF_EVICT_WALK:		/* Walk point */
			case WT_REF_EVICT_FORCE:	/* Forced evict */
			case WT_REF_LOCKED:		/* Being evicted */
			case WT_REF_READING:		/* Being read */
				return (EBUSY);
			}

	/*
	 * If the file is being checkpointed, we cannot evict dirty pages,
	 * because that may free a page that appears on an internal page in the
	 * checkpoint.  Don't rely on new updates being skipped by the
	 * transaction used for transaction reads: (1) there are paths that
	 * dirty pages for artificial reasons; (2) internal pages aren't
	 * transactional; and (3) if an update was skipped during the
	 * checkpoint (leaving the page dirty), then rolled back, we could
	 * still successfully overwrite a page and corrupt the checkpoint.
	 *
	 * Further, even for clean pages, the checkpoint's reconciliation of an
	 * internal page might race with us as we evict a child in the page's
	 * subtree.
	 *
	 * One half of that test is in the reconciliation code: the checkpoint
	 * thread waits for eviction-locked pages to settle before determining
	 * their status.  The other half of the test is here: after acquiring
	 * the exclusive eviction lock on a page, confirm no page in the page's
	 * stack of pages from the root is being reconciled in a checkpoint.
	 * This ensures we either see the checkpoint-walk state here, or the
	 * reconciliation of the internal page sees our exclusive lock on the
	 * child page and waits until we're finished evicting the child page
	 * (or give up if eviction isn't possible).
	 *
	 * We must check the full stack (we might be attempting to evict a leaf
	 * page multiple levels beneath the internal page being reconciled as
	 * part of the checkpoint, and  all of the intermediate nodes are being
	 * merged into the internal page).
	 *
	 * There's no simple test for knowing if a page in our page stack is
	 * involved in a checkpoint.  The internal page's checkpoint-walk flag
	 * is the best test we have, but because it's never set for the root
	 * page, it's not a complete test.
	 *
	 * Quit for any page that's not a simple, in-memory page.  (This is
	 * almost the same as checking for the checkpoint-walk flag.  I don't
	 * think there are code paths that change the page's status from
	 * checkpoint-walk, but these races are hard enough that I'm not going
	 * to proceed if there's anything other than a vanilla, in-memory tree
	 * stack.)  Climb until we find a page that can't be merged into its
	 * parent, failing if we never find such a page.
	 */
	if (btree->checkpointing && !merge && __wt_page_is_modified(page)) {
ckpt:		WT_CSTAT_INCR(session, cache_eviction_checkpoint);
		WT_DSTAT_INCR(session, cache_eviction_checkpoint);
		return (EBUSY);
	}

	if (btree->checkpointing && top)
		for (t = page->parent;; t = t->parent) {
			if (t == NULL || t->ref == NULL)	/* root */
				goto ckpt;
			if (t->ref->state != WT_REF_MEM)	/* scary */
				goto ckpt;
			if (t->modify == NULL ||		/* not merged */
			    !F_ISSET(t->modify, WT_PM_REC_EMPTY |
			    WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE))
				break;
		}

	/*
	 * If we are merging internal pages, we just need exclusive access, we
	 * don't need to write everything.
	 */
	if (merge)
		return (0);

	/*
	 * Fail if any page in the top-level page's subtree won't be merged
	 * into its parent: the page that cannot be merged must be evicted
	 * first.
	 * The test is necessary but should not fire much: the eviction code is
	 * biased for leaf pages, an internal page shouldn't be selected for
	 * eviction until its children have been evicted.
	 *
	 * We have to write dirty pages to know their final state: a page
	 * marked empty may have had records added since reconciliation, and a
	 * page marked split may have had records deleted and no longer need
	 * to split.  Split-merge pages are the exception: they can never
	 * change into anything other than a split-merge page and are merged
	 * regardless of being clean or dirty.
	 *
	 * Writing the page is expensive, so do a cheap test first: if it
	 * doesn't appear a subtree page can be merged, quit.  It's possible
	 * the page
	 * has been emptied since it was last reconciled, and writing it before
	 * testing might be worthwhile, but it's more probable we're attempting
	 * to evict an internal page with live children, and that's a waste of
	 * time.
	 */
	mod = page->modify;
	if (!top && (mod == NULL || !F_ISSET(mod,
	    WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE)))
		return (EBUSY);

	/*
	 * If the page is dirty and can possibly change state, write it so we
	 * know the final state.
	 */
	if (__wt_page_is_modified(page) &&
	    !F_ISSET(mod, WT_PM_REC_SPLIT_MERGE)) {
		ret = __wt_rec_write(session, page,
		    NULL, WT_EVICTION_SERVER_LOCKED | WT_SKIP_UPDATE_QUIT);

		/*
		 * Update the page's modification reference, reconciliation
		 * might have changed it.
		 */
		mod = page->modify;

		/* If there are unwritten changes on the page, give up. */
		if (ret == EBUSY) {
			WT_VERBOSE_RET(session, evict,
			    "eviction failed, reconciled page not clean");

			/* 
			 * We may be able to discard any "update" memory the
			 * page no longer needs.
			 */
			switch (page->type) {
			case WT_PAGE_COL_FIX:
			case WT_PAGE_COL_VAR:
				__wt_col_leaf_obsolete(session, page);
				break;
			case WT_PAGE_ROW_LEAF:
				__wt_row_leaf_obsolete(session, page);
				break;
			}
		}
		WT_RET(ret);

		WT_ASSERT(session, __wt_page_is_modified(page) == 0);
	}

	/*
	 * Repeat the test: fail if any page in the top-level page's subtree
	 * won't be merged into its parent.
	 */
	if (!top && (mod == NULL || !F_ISSET(mod,
	    WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE)))
		return (EBUSY);
	return (0);
}
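/*
 * A minimal, self-contained sketch (not WiredTiger code) of the parent-climb
 * check above: walk a hypothetical page stack toward the root and succeed only
 * if, before reaching the root, we find an in-memory page that will not be
 * merged into its parent.  The toy_page type, the flag values and the
 * toy_stack_ok() name are all invented for illustration.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define	TOY_IN_MEMORY	0x01u
#define	TOY_MERGE	0x02u		/* page will be merged into its parent */

struct toy_page {
	struct toy_page *parent;	/* NULL above the root */
	unsigned flags;
};

/* Return true if an ancestor below the root is in memory and won't merge. */
static bool
toy_stack_ok(const struct toy_page *page)
{
	const struct toy_page *t;

	for (t = page->parent;; t = t->parent) {
		if (t == NULL || t->parent == NULL)
			return (false);		/* reached the root: give up */
		if (!(t->flags & TOY_IN_MEMORY))
			return (false);		/* anything unusual: give up */
		if (!(t->flags & TOY_MERGE))
			return (true);		/* found a stable ancestor */
	}
}

int
main(void)
{
	struct toy_page root = { NULL, TOY_IN_MEMORY };
	struct toy_page mid = { &root, TOY_IN_MEMORY };
	struct toy_page leaf = { &mid, TOY_IN_MEMORY | TOY_MERGE };

	printf("evictable: %s\n", toy_stack_ok(&leaf) ? "yes" : "no");
	return (0);
}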
Example #2
0
/*
 * __wt_compact --
 *	Compact a file.
 */
int
__wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_BM *bm;
    WT_BTREE *btree;
    WT_DECL_RET;
    WT_REF *ref;
    bool block_manager_begin, skip;

    WT_UNUSED(cfg);

    btree = S2BT(session);
    bm = btree->bm;
    ref = NULL;
    block_manager_begin = false;

    WT_STAT_FAST_DATA_INCR(session, session_compact);

    /*
     * Check if compaction might be useful: the API layer will quit trying
     * to compact the data source if we make no progress, and the block
     * layer sets a flag if it thinks compaction is possible.
     */
    WT_RET(bm->compact_skip(bm, session, &skip));
    if (skip)
        return (0);

    /*
     * Reviewing in-memory pages requires looking at page reconciliation
     * results, because we care about where the page is stored now, not
     * where the page was stored when we first read it into the cache.
     * We need to ensure we don't race with page reconciliation as it's
     * writing the page modify information.
     *
     * There are three ways we call reconciliation: checkpoints, threads
     * writing leaf pages (usually in preparation for a checkpoint or if
     * closing a file), and eviction.
     *
     * We're holding the schema lock which serializes with checkpoints.
     */
    WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA));

    /*
     * Get the tree handle's flush lock which blocks threads writing leaf
     * pages.
     */
    __wt_spin_lock(session, &btree->flush_lock);

    /* Start compaction. */
    WT_ERR(bm->compact_start(bm, session));
    block_manager_begin = true;

    /* Walk the tree reviewing pages to see if they should be re-written. */
    for (;;) {
        /*
         * Pages read for compaction aren't "useful"; don't update the
         * read generation of pages already in memory, and if a page is
         * read, set its generation to a low value so it is evicted
         * quickly.
         */
        WT_ERR(__wt_tree_walk(session, &ref,
                              WT_READ_COMPACT | WT_READ_NO_GEN | WT_READ_WONT_NEED));
        if (ref == NULL)
            break;

        WT_ERR(__compact_rewrite(session, ref, &skip));
        if (skip)
            continue;

        session->compact_state = WT_COMPACT_SUCCESS;

        /* Rewrite the page: mark the page and tree dirty. */
        WT_ERR(__wt_page_modify_init(session, ref->page));
        __wt_page_modify_set(session, ref->page);

        WT_STAT_FAST_DATA_INCR(session, btree_compact_rewrite);
    }

err:
    if (ref != NULL)
        WT_TRET(__wt_page_release(session, ref, 0));

    if (block_manager_begin)
        WT_TRET(bm->compact_end(bm, session));

    /* Unblock threads writing leaf pages. */
    __wt_spin_unlock(session, &btree->flush_lock);

    return (ret);
}
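/*
 * A self-contained sketch (not WiredTiger code) of the compaction walk above:
 * visit each page, ask a block-layer-style predicate whether rewriting it
 * would help shrink the file, and mark it dirty if so.  The toy_page type,
 * toy_should_rewrite() and toy_compact() are invented for illustration.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_page {
	long long offset;	/* where the page currently lives in the file */
	bool dirty;		/* marked for rewrite */
};

/* Pretend the block layer wants anything in the last 20% of the file moved. */
static bool
toy_should_rewrite(const struct toy_page *page, long long file_size)
{
	return (page->offset > file_size / 10 * 8);
}

static int
toy_compact(struct toy_page *pages, int npages, long long file_size)
{
	int i, rewrites;

	for (rewrites = 0, i = 0; i < npages; ++i) {
		if (!toy_should_rewrite(&pages[i], file_size))
			continue;		/* skip: rewriting won't help */
		pages[i].dirty = true;		/* reconciliation rewrites it */
		++rewrites;
	}
	return (rewrites);
}

int
main(void)
{
	struct toy_page pages[] = {
		{ 100, false }, { 7000, false }, { 9500, false }
	};

	printf("%d page(s) marked for rewrite\n", toy_compact(pages, 3, 10000));
	return (0);
}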
Example #3
0
/*
 * __wt_tree_walk --
 *	Move to the next/previous page in the tree.
 */
int
__wt_tree_walk(WT_SESSION_IMPL *session,
    WT_REF **refp, uint64_t *walkcntp, uint32_t flags)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_INDEX *pindex;
	WT_REF *couple, *couple_orig, *ref;
	int prev, skip;
	uint32_t slot;

	btree = S2BT(session);

	/*
	 * Tree walks are special: they look inside page structures that splits
	 * may want to free.  Publish that the tree is active during this
	 * window.
	 */
	WT_ENTER_PAGE_INDEX(session);

	/*
	 * !!!
	 * Fast-truncate currently only works on row-store trees.
	 */
	if (btree->type != BTREE_ROW)
		LF_CLR(WT_READ_TRUNCATE);

	prev = LF_ISSET(WT_READ_PREV) ? 1 : 0;

	/*
	 * There are multiple reasons and approaches to walking the in-memory
	 * tree:
	 *
	 * (1) finding pages to evict (the eviction server);
	 * (2) writing just dirty leaves or internal nodes (checkpoint);
	 * (3) discarding pages (close);
	 * (4) truncating pages in a range (fast truncate);
	 * (5) skipping pages based on outside information (compaction);
	 * (6) cursor scans (applications).
	 *
	 * Except for cursor scans and compaction, the walk is limited to the
	 * cache, no pages are read.  In all cases, hazard pointers protect the
	 * walked pages from eviction.
	 *
	 * Walks use hazard-pointer coupling through the tree and that's OK
	 * (hazard pointers can't deadlock, so there's none of the usual
	 * problems found when logically locking up a btree).  If the eviction
	 * thread tries to evict the active page, it fails because of our
	 * hazard pointer.  If eviction tries to evict our parent, that fails
	 * because the parent has a child page that can't be discarded.  We do
	 * play one game: rather than coupling up to our parent and then back
	 * down to a new leaf, we couple directly to the next page to which
	 * we're descending, saving a hazard-pointer swap for each cursor page
	 * movement.
	 *
	 * !!!
	 * NOTE: we depend on the fact it's OK to release a page we don't hold,
	 * that is, it's OK to release couple when couple is set to NULL.
	 *
	 * Take a copy of any held page and clear the return value.  Remember
	 * the hazard pointer we're currently holding.
	 *
	 * We may be passed a pointer to btree->evict_page that we are clearing
	 * here.  We check when discarding pages that we're not discarding that
	 * page, so this clear must be done before the page is released.
	 */
	couple = couple_orig = ref = *refp;
	*refp = NULL;

	/* If no page is active, begin a walk from the start of the tree. */
	if (ref == NULL) {
		ref = &btree->root;
		if (ref->page == NULL)
			goto done;
		goto descend;
	}

ascend:	/*
	 * If the active page was the root, we've reached the walk's end.
	 * Release any hazard-pointer we're holding.
	 */
	if (__wt_ref_is_root(ref)) {
		WT_ERR(__wt_page_release(session, couple, flags));
		goto done;
	}

	/* Figure out the current slot in the WT_REF array. */
	__wt_page_refp(session, ref, &pindex, &slot);

	for (;;) {
		/*
		 * If we're at the last/first slot on the page, return this page
		 * in post-order traversal.  Otherwise we move to the next/prev
		 * slot and left/right-most element in its subtree.
		 */
		if ((prev && slot == 0) ||
		    (!prev && slot == pindex->entries - 1)) {
			ref = ref->home->pg_intl_parent_ref;

			/* Optionally skip internal pages. */
			if (LF_ISSET(WT_READ_SKIP_INTL))
				goto ascend;

			/*
			 * We've ascended the tree and are returning an internal
			 * page.  If it's the root, discard our hazard pointer,
			 * otherwise, swap our hazard pointer for the page we'll
			 * return.
			 */
			if (__wt_ref_is_root(ref))
				WT_ERR(__wt_page_release(
				    session, couple, flags));
			else {
				/*
				 * Locate the reference to our parent page then
				 * swap our child hazard pointer for the parent.
				 * We don't handle restart or not-found returns.
				 * It would require additional complexity and is
				 * not a possible return: we're moving to the
				 * parent of the current child page, our parent
				 * reference can't have split or been evicted.
				 */
				__wt_page_refp(session, ref, &pindex, &slot);
				if ((ret = __wt_page_swap(
				    session, couple, ref, flags)) != 0) {
					WT_TRET(__wt_page_release(
					    session, couple, flags));
					WT_ERR(ret);
				}
			}

			*refp = ref;
			goto done;
		}

		if (prev)
			--slot;
		else
			++slot;

		if (walkcntp != NULL)
			++*walkcntp;

		for (;;) {
			ref = pindex->index[slot];

			if (LF_ISSET(WT_READ_CACHE)) {
				/*
				 * Only look at unlocked pages in memory:
				 * fast-path some common cases.
				 */
				if (LF_ISSET(WT_READ_NO_WAIT) &&
				    ref->state != WT_REF_MEM)
					break;
			} else if (LF_ISSET(WT_READ_TRUNCATE)) {
				/*
				 * Avoid pulling a deleted page back in to try
				 * to delete it again.
				 */
				if (ref->state == WT_REF_DELETED &&
				    __wt_delete_page_skip(session, ref))
					break;
				/*
				 * If deleting a range, try to delete the page
				 * without instantiating it.
				 */
				WT_ERR(__wt_delete_page(session, ref, &skip));
				if (skip)
					break;
			} else if (LF_ISSET(WT_READ_COMPACT)) {
				/*
				 * Skip deleted pages, rewriting them doesn't
				 * seem useful.
				 */
				if (ref->state == WT_REF_DELETED)
					break;

				/*
				 * If the page is in-memory, we want to look at
				 * it (it may have been modified and written,
				 * and the current location is the interesting
				 * one in terms of compaction, not the original
				 * location).  If the page isn't in-memory, test
				 * if the page will help with compaction, don't
				 * read it if we don't have to.
				 */
				if (ref->state == WT_REF_DISK) {
					WT_ERR(__wt_compact_page_skip(
					    session, ref, &skip));
					if (skip)
						break;
				}
			} else {
				/*
				 * Try to skip deleted pages visible to us.
				 */
				if (ref->state == WT_REF_DELETED &&
				    __wt_delete_page_skip(session, ref))
					break;
			}

			ret = __wt_page_swap(session, couple, ref, flags);

			/*
			 * Not-found is an expected return when only walking
			 * in-cache pages.
			 */
			if (ret == WT_NOTFOUND) {
				ret = 0;
				break;
			}

			/*
			 * The page we're moving to might have split, in which
			 * case move to the last position we held.
			 */
			if (ret == WT_RESTART) {
				ret = 0;

				/*
				 * If this is a new walk that never coupled
				 * from the root to a saved position in the
				 * tree, restart the walk.
				 */
				if (couple == &btree->root) {
					ref = &btree->root;
					if (ref->page == NULL)
						goto done;
					goto descend;
				}

				/*
				 * If restarting from some original position,
				 * repeat the increment or decrement we made at
				 * that time. Otherwise, couple is an internal
				 * page we've acquired after moving from that
				 * starting position and we can treat it as a
				 * new page. This works because we never
				 * acquire a hazard pointer on a leaf page
				 * we're not going to return to our caller;
				 * this will stop working if that ever
				 * changes.
				 */
				WT_ASSERT(session,
				    couple == couple_orig ||
				    WT_PAGE_IS_INTERNAL(couple->page));
				ref = couple;
				__wt_page_refp(session, ref, &pindex, &slot);
				if (couple == couple_orig)
					break;
			}
			WT_ERR(ret);

			/*
			 * A new page: configure for traversal of any internal
			 * page's children, else return the leaf page.
			 */
descend:		couple = ref;
			page = ref->page;
			if (page->type == WT_PAGE_ROW_INT ||
			    page->type == WT_PAGE_COL_INT) {
				pindex = WT_INTL_INDEX_COPY(page);
				slot = prev ? pindex->entries - 1 : 0;
			} else {
				*refp = ref;
				goto done;
			}
		}
	}

done:
err:	WT_LEAVE_PAGE_INDEX(session);
	return (ret);
}
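/*
 * A tiny sketch (not WiredTiger code) of the hazard-pointer coupling the walk
 * relies on: acquire a reference on the page we're moving to before releasing
 * the one we hold, so eviction can never free a page out from under us.  The
 * toy_ref type and the toy_acquire/toy_release/toy_swap names are invented.
 */
#include <stddef.h>
#include <stdio.h>

struct toy_ref {
	const char *name;
	int hazard;		/* crude stand-in for a hazard-pointer count */
};

static int
toy_acquire(struct toy_ref *ref)
{
	ref->hazard++;		/* page is now pinned in memory */
	return (0);
}

static void
toy_release(struct toy_ref *ref)
{
	if (ref != NULL && ref->hazard > 0)
		ref->hazard--;	/* releasing a NULL/unheld ref is harmless */
}

/* Couple from the current page to the next one: acquire first, then release. */
static int
toy_swap(struct toy_ref **couple, struct toy_ref *next)
{
	int ret;

	if ((ret = toy_acquire(next)) != 0)
		return (ret);
	toy_release(*couple);
	*couple = next;
	return (0);
}

int
main(void)
{
	struct toy_ref a = { "leaf-1", 0 }, b = { "leaf-2", 0 };
	struct toy_ref *couple = NULL;

	(void)toy_swap(&couple, &a);
	(void)toy_swap(&couple, &b);
	printf("holding %s (a=%d, b=%d)\n", couple->name, a.hazard, b.hazard);
	return (0);
}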
Example #4
0
/*
 * __curjoin_entry_in_range --
 *	Check if a key is in the range specified by the entry, returning
 *	WT_NOTFOUND if not.
 */
static int
__curjoin_entry_in_range(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry,
    WT_ITEM *curkey, WT_CURSOR_JOIN_ITER *iter)
{
	WT_COLLATOR *collator;
	WT_CURSOR_JOIN_ENDPOINT *end, *endmax;
	u_int pos;
	int cmp;
	bool disjunction, passed;

	collator = (entry->index != NULL) ? entry->index->collator : NULL;
	endmax = &entry->ends[entry->ends_next];
	disjunction = F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION);

	/*
	 * The iterator may have already satisfied some endpoint conditions.
	 * If so and we're a disjunction, we're done.  If so and we're a
	 * conjunction, we can start past the satisfied conditions.
	 */
	if (iter == NULL)
		pos = 0;
	else {
		if (disjunction && iter->end_skip)
			return (0);
		pos = iter->end_pos + iter->end_skip;
	}

	for (end = &entry->ends[pos]; end < endmax; end++) {
		WT_RET(__wt_compare(session, collator, curkey, &end->key,
		    &cmp));
		switch (WT_CURJOIN_END_RANGE(end)) {
		case WT_CURJOIN_END_EQ:
			passed = (cmp == 0);
			break;

		case WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ:
			passed = (cmp >= 0);
			WT_ASSERT(session, iter == NULL);
			break;

		case WT_CURJOIN_END_GT:
			passed = (cmp > 0);
			if (passed && iter != NULL && pos == 0)
				iter->end_skip = 1;
			break;

		case WT_CURJOIN_END_LT | WT_CURJOIN_END_EQ:
			passed = (cmp <= 0);
			break;

		case WT_CURJOIN_END_LT:
			passed = (cmp < 0);
			break;

		WT_ILLEGAL_VALUE(session, WT_CURJOIN_END_RANGE(end));
		}

		if (!passed) {
			if (iter != NULL &&
			    (iter->is_equal ||
			    F_ISSET(end, WT_CURJOIN_END_LT))) {
				WT_RET(__curjoin_iter_bump(iter));
				return (WT_NOTFOUND);
			}
			if (!disjunction)
				return (WT_NOTFOUND);
			iter = NULL;
		} else if (disjunction)
			break;
	}
	if (disjunction && end == endmax)
		return (WT_NOTFOUND);
	return (0);
}
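/*
 * A standalone sketch (not WiredTiger code) of the endpoint test above, using
 * plain integers instead of packed keys and collators.  The toy_end struct,
 * the range constants and toy_in_range() are invented; the short-circuiting
 * mirrors the conjunction/disjunction handling in the loop above.
 */
#include <stdbool.h>
#include <stdio.h>

enum toy_range { TOY_EQ, TOY_GT, TOY_GE, TOY_LT, TOY_LE };

struct toy_end {
	enum toy_range range;
	int key;
};

static bool
toy_in_range(int key, const struct toy_end *ends, int nends, bool disjunction)
{
	bool passed;
	int cmp, i;

	for (i = 0; i < nends; ++i) {
		cmp = (key > ends[i].key) - (key < ends[i].key);
		passed = false;
		switch (ends[i].range) {
		case TOY_EQ: passed = cmp == 0; break;
		case TOY_GT: passed = cmp > 0; break;
		case TOY_GE: passed = cmp >= 0; break;
		case TOY_LT: passed = cmp < 0; break;
		case TOY_LE: passed = cmp <= 0; break;
		}
		if (disjunction && passed)
			return (true);	/* any satisfied endpoint is enough */
		if (!disjunction && !passed)
			return (false);	/* a conjunction needs every endpoint */
	}
	return (!disjunction);
}

int
main(void)
{
	struct toy_end ends[] = { { TOY_GE, 10 }, { TOY_LT, 20 } };

	printf("15 in [10,20): %d\n", toy_in_range(15, ends, 2, false));
	printf("25 in [10,20): %d\n", toy_in_range(25, ends, 2, false));
	return (0);
}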
Example #5
0
/*
 * __curjoin_init_bloom --
 *	Populate Bloom filters
 */
static int
__curjoin_init_bloom(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    WT_CURSOR_JOIN_ENTRY *entry, WT_BLOOM *bloom)
{
	WT_COLLATOR *collator;
	WT_CURSOR *c;
	WT_CURSOR_JOIN_ENDPOINT *end, *endmax;
	WT_DECL_ITEM(uribuf);
	WT_DECL_RET;
	WT_ITEM curkey, curvalue;
	size_t size;
	u_int skip;
	int cmp;
	const char *uri;
	const char *raw_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), "raw", NULL };

	c = NULL;
	skip = 0;

	if (entry->index != NULL)
		/*
		 * Open the raw index.  We're avoiding any references
		 * to the main table, they may be expensive.
		 */
		uri = entry->index->source;
	else {
		/*
		 * For joins on the main table, we just need the primary
		 * key for comparison, we don't need any values.
		 */
		size = strlen(cjoin->table->iface.name) + 3;
		WT_ERR(__wt_scr_alloc(session, size, &uribuf));
		WT_ERR(__wt_buf_fmt(session, uribuf, "%s()",
		    cjoin->table->iface.name));
		uri = uribuf->data;
	}
	WT_ERR(__wt_open_cursor(session, uri, &cjoin->iface, raw_cfg, &c));

	/* Initially position the cursor if necessary. */
	endmax = &entry->ends[entry->ends_next];
	if ((end = &entry->ends[0]) < endmax) {
		if (F_ISSET(end, WT_CURJOIN_END_GT) ||
		    WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_EQ) {
			WT_ERR(__wt_cursor_dup_position(end->cursor, c));
			if (WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_GE)
				skip = 1;
		} else if (F_ISSET(end, WT_CURJOIN_END_LT)) {
			if ((ret = c->next(c)) == WT_NOTFOUND)
				goto done;
			WT_ERR(ret);
		} else
			WT_PANIC_ERR(session, EINVAL,
			    "fatal error in join cursor position state");
	}
	collator = (entry->index == NULL) ? NULL : entry->index->collator;
	while (ret == 0) {
		WT_ERR(c->get_key(c, &curkey));
		entry->stats.iterated++;
		if (entry->index != NULL) {
			/*
			 * Repack so it's comparable to the
			 * reference endpoints.
			 */
			WT_ERR(__wt_struct_repack(session,
			    c->key_format,
			    (entry->repack_format != NULL ?
			    entry->repack_format : entry->index->idxkey_format),
			    &c->key, &curkey));
		}
		for (end = &entry->ends[skip]; end < endmax; end++) {
			WT_ERR(__wt_compare(session, collator, &curkey,
			    &end->key, &cmp));
			if (F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION)) {
				/* if condition satisfied, insert immediately */
				switch (WT_CURJOIN_END_RANGE(end)) {
				case WT_CURJOIN_END_EQ:
					if (cmp == 0)
						goto insert;
					break;
				case WT_CURJOIN_END_GT:
					if (cmp > 0) {
						/* skip this check next time */
						skip = entry->ends_next;
						goto insert;
					}
					break;
				case WT_CURJOIN_END_GE:
					if (cmp >= 0)
						goto insert;
					break;
				case WT_CURJOIN_END_LT:
					if (cmp < 0)
						goto insert;
					break;
				case WT_CURJOIN_END_LE:
					if (cmp <= 0)
						goto insert;
					break;
				}
			} else if (!F_ISSET(end, WT_CURJOIN_END_LT)) {
				if (cmp < 0 || (cmp == 0 &&
				    !F_ISSET(end, WT_CURJOIN_END_EQ)))
					goto advance;
				if (cmp > 0) {
					if (F_ISSET(end, WT_CURJOIN_END_GT))
						skip = 1;
					else
						goto done;
				}
			} else {
				if (cmp > 0 || (cmp == 0 &&
				    !F_ISSET(end, WT_CURJOIN_END_EQ)))
					goto done;
			}
		}
		/*
		 * Either it's a disjunction that hasn't satisfied any
		 * condition, or it's a conjunction that has satisfied all
		 * conditions.
		 */
		if (F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION))
			goto advance;
insert:
		if (entry->index != NULL) {
			curvalue.data =
			    (unsigned char *)curkey.data + curkey.size;
			WT_ASSERT(session, c->key.size > curkey.size);
			curvalue.size = c->key.size - curkey.size;
		} else
			WT_ERR(c->get_key(c, &curvalue));
		__wt_bloom_insert(bloom, &curvalue);
		entry->stats.bloom_insert++;
advance:
		if ((ret = c->next(c)) == WT_NOTFOUND)
			break;
	}
done:
	WT_ERR_NOTFOUND_OK(ret);

err:	if (c != NULL)
		WT_TRET(c->close(c));
	__wt_scr_free(session, &uribuf);
	return (ret);
}
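/*
 * A minimal Bloom-filter sketch (not WiredTiger's WT_BLOOM implementation)
 * showing what "populate a Bloom filter from a cursor scan" amounts to: hash
 * each qualifying key a few times and set the corresponding bits, then later
 * test membership the same way.  The toy_bloom type, toy_hash() and the
 * size/hash-count constants are invented for illustration.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	TOY_BLOOM_BITS		1024u
#define	TOY_BLOOM_HASHES	4u

struct toy_bloom {
	uint8_t bits[TOY_BLOOM_BITS / 8];
};

/* FNV-1a, salted so each "hash function" lands on different bits. */
static uint32_t
toy_hash(const char *key, uint32_t salt)
{
	uint32_t h = 2166136261u + salt;

	for (; *key != '\0'; ++key) {
		h ^= (uint8_t)*key;
		h *= 16777619u;
	}
	return (h);
}

static void
toy_bloom_insert(struct toy_bloom *b, const char *key)
{
	uint32_t bit, i;

	for (i = 0; i < TOY_BLOOM_HASHES; ++i) {
		bit = toy_hash(key, i) % TOY_BLOOM_BITS;
		b->bits[bit / 8] |= (uint8_t)(1u << (bit % 8));
	}
}

static int
toy_bloom_maybe_contains(const struct toy_bloom *b, const char *key)
{
	uint32_t bit, i;

	for (i = 0; i < TOY_BLOOM_HASHES; ++i) {
		bit = toy_hash(key, i) % TOY_BLOOM_BITS;
		if ((b->bits[bit / 8] & (1u << (bit % 8))) == 0)
			return (0);	/* definitely absent */
	}
	return (1);			/* probably present (false positives) */
}

int
main(void)
{
	struct toy_bloom bloom;
	const char *keys[] = { "alpha", "bravo", "charlie" };
	size_t i;

	memset(&bloom, 0, sizeof(bloom));
	for (i = 0; i < sizeof(keys) / sizeof(keys[0]); ++i)
		toy_bloom_insert(&bloom, keys[i]);	/* the "scan" */

	printf("bravo: %d, delta: %d\n",
	    toy_bloom_maybe_contains(&bloom, "bravo"),
	    toy_bloom_maybe_contains(&bloom, "delta"));
	return (0);
}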
Example #6
0
/*
 * __wt_schema_project_merge --
 *	Given list of cursors and a projection, build a buffer containing the
 *	column values read from the cursors.
 */
int
__wt_schema_project_merge(WT_SESSION_IMPL *session,
    WT_CURSOR **cp, const char *proj_arg, const char *vformat, WT_ITEM *value)
{
	WT_CURSOR *c;
	WT_DECL_PACK(pack);
	WT_DECL_PACK_VALUE(pv);
	WT_DECL_PACK_VALUE(vpv);
	WT_ITEM *buf;
	WT_PACK vpack;
	u_long arg;
	char *proj;
	const uint8_t *p, *end;
	uint8_t *vp;
	size_t len;

	p = end = NULL;		/* -Wuninitialized */

	WT_RET(__wt_buf_init(session, value, 0));
	WT_RET(__pack_init(session, &vpack, vformat));

	for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
		arg = strtoul(proj, &proj, 10);

		switch (*proj) {
		case WT_PROJ_KEY:
			c = cp[arg];
			if (WT_CURSOR_RECNO(c)) {
				c->key.data = &c->recno;
				c->key.size = sizeof(c->recno);
				WT_RET(__pack_init(session, &pack, "R"));
			} else
				WT_RET(__pack_init(
				    session, &pack, c->key_format));
			buf = &c->key;
			p = buf->data;
			end = p + buf->size;
			continue;

		case WT_PROJ_VALUE:
			c = cp[arg];
			WT_RET(__pack_init(session, &pack, c->value_format));
			buf = &c->value;
			p = buf->data;
			end = p + buf->size;
			continue;
		}

		/*
		 * Otherwise, the argument is a count, where a missing
		 * count means a count of 1.
		 */
		for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) {
			switch (*proj) {
			case WT_PROJ_NEXT:
			case WT_PROJ_SKIP:
			case WT_PROJ_REUSE:
				WT_RET(__pack_next(&pack, &pv));
				WT_RET(__unpack_read(session, &pv,
				    &p, (size_t)(end - p)));
				/* Only copy the value out once. */
				if (*proj != WT_PROJ_NEXT)
					break;

				WT_RET(__pack_next(&vpack, &vpv));
				/* Make sure the types are compatible. */
				WT_ASSERT(session,
				    __wt_tolower((u_char)pv.type) ==
				    __wt_tolower((u_char)vpv.type));
				vpv.u = pv.u;
				WT_RET(__pack_size(session, &vpv, &len));
				WT_RET(__wt_buf_grow(session,
				    value, value->size + len));
				vp = (uint8_t *)value->mem + value->size;
				WT_RET(__pack_write(session, &vpv, &vp, len));
				value->size += len;
				break;
			}
		}
	}

	return (0);
}
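/*
 * A toy parser (not WiredTiger's actual plan format) illustrating the shape of
 * the projection string the code above walks: each step is an optional decimal
 * count followed by a single command character, and strtoul() advances the
 * pointer past the digits.  The command letters used here ('K', 'V', 'N', 'S')
 * and toy_plan_walk() are invented, not the real WT_PROJ_* values.
 */
#include <stdio.h>
#include <stdlib.h>

static void
toy_plan_walk(const char *plan)
{
	unsigned long arg;
	char *p;

	for (p = (char *)plan; *p != '\0'; ++p) {
		arg = strtoul(p, &p, 10);	/* leading count, 0 if none */
		switch (*p) {
		case 'K':
			printf("switch to key of cursor %lu\n", arg);
			break;
		case 'V':
			printf("switch to value of cursor %lu\n", arg);
			break;
		case 'N':
			printf("copy %lu column(s)\n", arg == 0 ? 1 : arg);
			break;
		case 'S':
			printf("skip %lu column(s)\n", arg == 0 ? 1 : arg);
			break;
		default:
			printf("unknown command '%c'\n", *p);
			return;
		}
	}
}

int
main(void)
{
	toy_plan_walk("0K2N1V3S");
	return (0);
}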
Example #7
0
/*
 * __wt_btcur_prev --
 *	Move to the previous record in the tree.
 */
int
__wt_btcur_prev(WT_CURSOR_BTREE *cbt, int truncating)
{
	WT_DECL_RET;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	uint32_t flags;
	int newpage;

	session = (WT_SESSION_IMPL *)cbt->iface.session;

	WT_STAT_FAST_CONN_INCR(session, cursor_prev);
	WT_STAT_FAST_DATA_INCR(session, cursor_prev);

	flags = WT_READ_PREV | WT_READ_SKIP_INTL;	/* Tree walk flags. */
	if (truncating)
		LF_SET(WT_READ_TRUNCATE);

	WT_RET(__cursor_func_init(cbt, 0));

	/*
	 * If we aren't already iterating in the right direction, there's
	 * some setup to do.
	 */
	if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV))
		__wt_btcur_iterate_setup(cbt, 0);

	/*
	 * Walk any page we're holding until the underlying call returns not-
	 * found.  Then, move to the previous page, until we reach the start
	 * of the file.
	 */
	page = cbt->ref == NULL ? NULL : cbt->ref->page;
	for (newpage = 0;; newpage = 1) {
		if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
			switch (page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_append_prev(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_append_prev(cbt, newpage);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			if (ret == 0)
				break;
			F_CLR(cbt, WT_CBT_ITERATE_APPEND);
			if (ret != WT_NOTFOUND)
				break;
			newpage = 1;
		}
		if (page != NULL) {
			switch (page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_prev(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_prev(cbt, newpage);
				break;
			case WT_PAGE_ROW_LEAF:
				ret = __cursor_row_prev(cbt, newpage);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			if (ret != WT_NOTFOUND)
				break;
		}

		WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
		WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
		page = cbt->ref->page;
		WT_ASSERT(session,
		    page->type != WT_PAGE_COL_INT &&
		    page->type != WT_PAGE_ROW_INT);

		/*
		 * The last page in a column-store has appended entries.
		 * We handle it separately from the usual cursor code:
		 * it's only that one page and it's in a simple format.
		 */
		if (page->type != WT_PAGE_ROW_LEAF &&
		    (cbt->ins_head = WT_COL_APPEND(page)) != NULL)
			F_SET(cbt, WT_CBT_ITERATE_APPEND);
	}

err:	if (ret != 0)
		WT_TRET(__cursor_error_resolve(cbt));
	return (ret);
}
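/*
 * A condensed, self-contained sketch (not WiredTiger code) of the loop shape
 * above: exhaust the records on the page we hold and, only when the per-page
 * step reports "not found", move to the previous page, until we run out of
 * pages.  The toy_page/toy_cursor types, TOY_NOTFOUND and toy_prev() are
 * invented for illustration.
 */
#include <stdio.h>

#define	TOY_NOTFOUND	1

struct toy_page {
	const int *records;
	int nrecords;
};

struct toy_cursor {
	const struct toy_page *pages;
	int npages;
	int page_slot;		/* current page; npages means not positioned */
	int rec_slot;		/* next record to return on that page */
};

static int
toy_prev(struct toy_cursor *cbt, int *valuep)
{
	const struct toy_page *page;

	for (;;) {
		if (cbt->page_slot < cbt->npages) {
			page = &cbt->pages[cbt->page_slot];
			if (cbt->rec_slot > 0) {	/* records remain */
				*valuep = page->records[--cbt->rec_slot];
				return (0);
			}
		}
		if (cbt->page_slot == 0)
			return (TOY_NOTFOUND);		/* start of the tree */
		--cbt->page_slot;			/* previous page */
		cbt->rec_slot = cbt->pages[cbt->page_slot].nrecords;
	}
}

int
main(void)
{
	const int p0[] = { 1, 2 }, p1[] = { 3, 4, 5 };
	const struct toy_page pages[] = { { p0, 2 }, { p1, 3 } };
	struct toy_cursor cbt = { pages, 2, 2, 0 };
	int v;

	while (toy_prev(&cbt, &v) == 0)
		printf("%d ", v);
	printf("\n");
	return (0);
}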
Example #8
0
/*
 * __wt_lsm_meta_write --
 *	Write the metadata for an LSM tree.
 */
int
__wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_DECL_ITEM(buf);
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	u_int i;
	int first;

	WT_RET(__wt_scr_alloc(session, 0, &buf));
	WT_ERR(__wt_buf_fmt(session, buf,
	    "key_format=%s,value_format=%s,bloom_config=(%s),file_config=(%s)",
	    lsm_tree->key_format, lsm_tree->value_format,
	    lsm_tree->bloom_config, lsm_tree->file_config));
	if (lsm_tree->collator_name != NULL)
		WT_ERR(__wt_buf_catfmt(
		    session, buf, ",collator=%s", lsm_tree->collator_name));
	WT_ERR(__wt_buf_catfmt(session, buf,
	    ",last=%" PRIu32
	    ",chunk_count_limit=%" PRIu32
	    ",chunk_max=%" PRIu64
	    ",chunk_size=%" PRIu64
	    ",auto_throttle=%" PRIu32
	    ",merge_max=%" PRIu32
	    ",merge_min=%" PRIu32
	    ",bloom=%" PRIu32
	    ",bloom_bit_count=%" PRIu32
	    ",bloom_hash_count=%" PRIu32,
	    lsm_tree->last, lsm_tree->chunk_count_limit,
	    lsm_tree->chunk_max, lsm_tree->chunk_size,
	    F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) ? 1 : 0,
	    lsm_tree->merge_max, lsm_tree->merge_min, lsm_tree->bloom,
	    lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count));
	WT_ERR(__wt_buf_catfmt(session, buf, ",chunks=["));
	for (i = 0; i < lsm_tree->nchunks; i++) {
		chunk = lsm_tree->chunk[i];
		if (i > 0)
			WT_ERR(__wt_buf_catfmt(session, buf, ","));
		WT_ERR(__wt_buf_catfmt(session, buf, "id=%" PRIu32, chunk->id));
		if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
			WT_ERR(__wt_buf_catfmt(session, buf, ",bloom"));
		if (chunk->size != 0)
			WT_ERR(__wt_buf_catfmt(session, buf,
			    ",chunk_size=%" PRIu64, chunk->size));
		if (chunk->count != 0)
			WT_ERR(__wt_buf_catfmt(
			    session, buf, ",count=%" PRIu64, chunk->count));
		WT_ERR(__wt_buf_catfmt(
		    session, buf, ",generation=%" PRIu32, chunk->generation));
	}
	WT_ERR(__wt_buf_catfmt(session, buf, "]"));
	WT_ERR(__wt_buf_catfmt(session, buf, ",old_chunks=["));
	first = 1;
	for (i = 0; i < lsm_tree->nold_chunks; i++) {
		chunk = lsm_tree->old_chunks[i];
		WT_ASSERT(session, chunk != NULL);
		if (first)
			first = 0;
		else
			WT_ERR(__wt_buf_catfmt(session, buf, ","));
		WT_ERR(__wt_buf_catfmt(session, buf, "\"%s\"", chunk->uri));
		if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
			WT_ERR(__wt_buf_catfmt(
			    session, buf, ",bloom=\"%s\"", chunk->bloom_uri));
	}
	WT_ERR(__wt_buf_catfmt(session, buf, "]"));
	ret = __wt_metadata_update(session, lsm_tree->name, buf->data);
	WT_ERR(ret);

err:	__wt_scr_free(session, &buf);
	return (ret);
}
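/*
 * A small sketch (not WiredTiger's WT_ITEM/__wt_buf_catfmt API) of the pattern
 * above: build a "key=value,..." metadata string by repeatedly appending
 * formatted fragments, including a bracketed list with commas only between
 * elements.  The toy_buf type and toy_catfmt() are invented for illustration.
 */
#include <stdarg.h>
#include <stdio.h>

struct toy_buf {
	char data[256];
	size_t len;
};

static int
toy_catfmt(struct toy_buf *buf, const char *fmt, ...)
{
	va_list ap;
	int n;

	va_start(ap, fmt);
	n = vsnprintf(buf->data + buf->len, sizeof(buf->data) - buf->len,
	    fmt, ap);
	va_end(ap);
	if (n < 0 || (size_t)n >= sizeof(buf->data) - buf->len)
		return (-1);			/* would overflow: fail */
	buf->len += (size_t)n;
	return (0);
}

int
main(void)
{
	struct toy_buf buf = { "", 0 };
	const unsigned chunk_ids[] = { 7, 8, 9 };
	size_t i;

	(void)toy_catfmt(&buf, "key_format=%s,chunk_size=%u", "S", 4096u);
	(void)toy_catfmt(&buf, ",chunks=[");
	for (i = 0; i < 3; ++i)
		(void)toy_catfmt(&buf, "%sid=%u",
		    i > 0 ? "," : "", chunk_ids[i]);
	(void)toy_catfmt(&buf, "]");
	printf("%s\n", buf.data);
	return (0);
}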
Example #9
0
/*
 * __wt_btcur_prev --
 *	Move to the previous record in the tree.
 */
int
__wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating)
{
	WT_DECL_RET;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	uint32_t flags;
	bool newpage;

	session = (WT_SESSION_IMPL *)cbt->iface.session;

	WT_STAT_FAST_CONN_INCR(session, cursor_prev);
	WT_STAT_FAST_DATA_INCR(session, cursor_prev);

	flags = WT_READ_PREV | WT_READ_SKIP_INTL;	/* Tree walk flags. */
	if (truncating)
		LF_SET(WT_READ_TRUNCATE);

	WT_RET(__cursor_func_init(cbt, false));

	/*
	 * If we aren't already iterating in the right direction, there's
	 * some setup to do.
	 */
	if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV))
		__wt_btcur_iterate_setup(cbt);

	/*
	 * Walk any page we're holding until the underlying call returns not-
	 * found.  Then, move to the previous page, until we reach the start
	 * of the file.
	 */
	for (newpage = false;; newpage = true) {
		page = cbt->ref == NULL ? NULL : cbt->ref->page;
		WT_ASSERT(session, page == NULL || !WT_PAGE_IS_INTERNAL(page));

		/*
		 * The last page in a column-store has appended entries.
		 * We handle it separately from the usual cursor code:
		 * it's only that one page and it's in a simple format.
		 */
		if (newpage && page != NULL && page->type != WT_PAGE_ROW_LEAF &&
		    (cbt->ins_head = WT_COL_APPEND(page)) != NULL)
			F_SET(cbt, WT_CBT_ITERATE_APPEND);

		if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
			switch (page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_append_prev(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_append_prev(cbt, newpage);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			if (ret == 0)
				break;
			F_CLR(cbt, WT_CBT_ITERATE_APPEND);
			if (ret != WT_NOTFOUND)
				break;
			newpage = true;
		}
		if (page != NULL) {
			switch (page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_prev(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_prev(cbt, newpage);
				break;
			case WT_PAGE_ROW_LEAF:
				ret = __cursor_row_prev(cbt, newpage);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			if (ret != WT_NOTFOUND)
				break;
		}

		/*
		 * If we saw a lot of deleted records on this page, or we went
		 * all the way through a page and only saw deleted records, try
		 * to evict the page when we release it.  Otherwise repeatedly
		 * deleting from the beginning of a tree can have quadratic
		 * performance.  Take care not to force eviction of pages that
		 * are genuinely empty, in new trees.
		 */
		if (page != NULL &&
		    (cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD ||
		    (newpage && cbt->page_deleted_count > 0)))
			__wt_page_evict_soon(page);
		cbt->page_deleted_count = 0;

		WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
		WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
	}
#ifdef HAVE_DIAGNOSTIC
	if (ret == 0)
		WT_ERR(__wt_cursor_key_order_check(session, cbt, false));
#endif

err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}
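/*
 * A self-contained sketch (not WiredTiger code) of the eviction heuristic in
 * the comment above: if a page produced many deleted records, or we crossed
 * the whole page and saw only deleted records, ask for it to be evicted soon
 * so repeated deletes from one end of the tree don't go quadratic.  The
 * TOY_DELETE_THRESHOLD constant and toy_should_evict_soon() are invented.
 */
#include <stdbool.h>
#include <stdio.h>

#define	TOY_DELETE_THRESHOLD	1000u

static bool
toy_should_evict_soon(unsigned deleted_count, bool crossed_whole_page)
{
	if (deleted_count > TOY_DELETE_THRESHOLD)
		return (true);		/* a lot of deleted records */
	if (crossed_whole_page && deleted_count > 0)
		return (true);		/* page held nothing but deletions */
	return (false);			/* genuinely empty or mostly live */
}

int
main(void)
{
	printf("%d %d %d\n",
	    toy_should_evict_soon(1500, false),
	    toy_should_evict_soon(3, true),
	    toy_should_evict_soon(0, true));
	return (0);
}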
Example #10
0
/*
 * __wt_lsm_tree_throttle --
 *	Calculate whether LSM updates need to be throttled. Must be called
 *	with the LSM tree lock held.
 */
void
__wt_lsm_tree_throttle(
    WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int decrease_only)
{
	WT_LSM_CHUNK *last_chunk, **cp, *ondisk, *prev_chunk;
	uint64_t cache_sz, cache_used, oldtime, record_count, timediff;
	uint32_t in_memory, gen0_chunks;

	/* Never throttle in small trees. */
	if (lsm_tree->nchunks < 3) {
		lsm_tree->ckpt_throttle = lsm_tree->merge_throttle = 0;
		return;
	}

	cache_sz = S2C(session)->cache_size;

	/*
	 * In the steady state, we expect that the checkpoint worker thread
	 * will keep up with inserts.  If not, throttle the insert rate to
	 * avoid filling the cache with in-memory chunks.  Threads sleep every
	 * 100 operations, so take that into account in the calculation.
	 *
	 * Also throttle based on whether merge threads are keeping up.  If
	 * there are enough chunks that have never been merged we slow down
	 * inserts so that merges have some chance of keeping up.
	 *
	 * Count the number of in-memory chunks, the number of unmerged chunks
	 * on disk, and find the most recent on-disk chunk (if any).
	 */
	record_count = 1;
	gen0_chunks = in_memory = 0;
	ondisk = NULL;
	for (cp = lsm_tree->chunk + lsm_tree->nchunks - 1;
	    cp >= lsm_tree->chunk;
	    --cp)
		if (!F_ISSET(*cp, WT_LSM_CHUNK_ONDISK)) {
			record_count += (*cp)->count;
			++in_memory;
		} else {
			/*
			 * Assign ondisk to the last chunk that has been
			 * flushed since the tree was last opened (i.e., it's on
			 * disk and stable is not set).
			 */
			if (ondisk == NULL &&
			    ((*cp)->generation == 0 &&
			    !F_ISSET(*cp, WT_LSM_CHUNK_STABLE)))
				ondisk = *cp;

			if ((*cp)->generation == 0 &&
			    !F_ISSET(*cp, WT_LSM_CHUNK_MERGING))
				++gen0_chunks;
		}

	last_chunk = lsm_tree->chunk[lsm_tree->nchunks - 1];

	/* Checkpoint throttling, based on the number of in-memory chunks. */
	if (!F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) || in_memory <= 3)
		lsm_tree->ckpt_throttle = 0;
	else if (decrease_only)
		; /* Nothing to do */
	else if (ondisk == NULL) {
		/*
		 * No checkpoint has completed this run.  Keep slowing down
		 * inserts until one does.
		 */
		lsm_tree->ckpt_throttle =
		    WT_MAX(WT_LSM_THROTTLE_START, 2 * lsm_tree->ckpt_throttle);
	} else {
		WT_ASSERT(session,
		    WT_TIMECMP(last_chunk->create_ts, ondisk->create_ts) >= 0);
		timediff =
		    WT_TIMEDIFF(last_chunk->create_ts, ondisk->create_ts);
		lsm_tree->ckpt_throttle =
		    (long)((in_memory - 2) * timediff / (20 * record_count));

		/*
		 * Get more aggressive as the in-memory chunks consume a large
		 * proportion of the cache.  In-memory chunks are allowed to
		 * grow up to twice as large as the configured value when
		 * checkpoints aren't keeping up, and that worst case is when
		 * this calculation is relevant.
		 * There is nothing particularly special about the chosen
		 * multipliers.
		 */
		cache_used = in_memory * lsm_tree->chunk_size * 2;
		if (cache_used > cache_sz * 0.8)
			lsm_tree->ckpt_throttle *= 5;
	}

	/*
	 * Merge throttling, based on the number of on-disk, level 0 chunks.
	 *
	 * Don't throttle if the tree has less than a single level's number
	 * of chunks.
	 */
	if (lsm_tree->nchunks < lsm_tree->merge_max)
		lsm_tree->merge_throttle = 0;
	else if (gen0_chunks < WT_LSM_MERGE_THROTTLE_THRESHOLD)
		WT_LSM_MERGE_THROTTLE_DECREASE(lsm_tree->merge_throttle);
	else if (!decrease_only)
		WT_LSM_MERGE_THROTTLE_INCREASE(lsm_tree->merge_throttle);

	/* Put an upper bound of 1s on both throttle calculations. */
	lsm_tree->ckpt_throttle = WT_MIN(1000000, lsm_tree->ckpt_throttle);
	lsm_tree->merge_throttle = WT_MIN(1000000, lsm_tree->merge_throttle);

	/*
	 * Update our estimate of how long each in-memory chunk stays active.
	 * Filter out some noise by keeping a weighted history of the
	 * calculated value.  Wait until we have enough chunks that we can
	 * check that the new value is sane: otherwise, after a long idle
	 * period, we can calculate a crazy value.
	 */
	if (in_memory > 1 && ondisk != NULL) {
		prev_chunk = lsm_tree->chunk[lsm_tree->nchunks - 2];
		WT_ASSERT(session, prev_chunk->generation == 0);
		WT_ASSERT(session, WT_TIMECMP(
		    last_chunk->create_ts, prev_chunk->create_ts) >= 0);
		timediff =
		    WT_TIMEDIFF(last_chunk->create_ts, prev_chunk->create_ts);
		WT_ASSERT(session,
		    WT_TIMECMP(prev_chunk->create_ts, ondisk->create_ts) >= 0);
		oldtime = WT_TIMEDIFF(prev_chunk->create_ts, ondisk->create_ts);
		if (timediff < 10 * oldtime)
			lsm_tree->chunk_fill_ms =
			    (3 * lsm_tree->chunk_fill_ms +
			    timediff / 1000000) / 4;
	}
}
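/*
 * A standalone numeric sketch (not WiredTiger code) of the checkpoint-throttle
 * calculation described above, keeping the same shape of formula: spread the
 * time it took to fill the in-memory chunks across the records inserted, scale
 * up when those chunks threaten to fill the cache, and cap the result at one
 * second.  The constants and toy_ckpt_throttle_us() name are illustrative.
 */
#include <stdint.h>
#include <stdio.h>

#define	TOY_MAX_THROTTLE_US	1000000u	/* one second, in microseconds */

static uint64_t
toy_ckpt_throttle_us(uint32_t in_memory_chunks, uint64_t fill_time_us,
    uint64_t record_count, uint64_t chunk_bytes, uint64_t cache_bytes)
{
	uint64_t throttle;

	if (in_memory_chunks <= 3 || record_count == 0)
		return (0);		/* checkpoints are keeping up */

	/* Per-insert sleep proportional to how far behind we are. */
	throttle = (in_memory_chunks - 2) * fill_time_us / (20 * record_count);

	/* Get more aggressive as in-memory chunks consume the cache. */
	if (in_memory_chunks * chunk_bytes * 2 > cache_bytes * 8 / 10)
		throttle *= 5;

	return (throttle > TOY_MAX_THROTTLE_US ?
	    TOY_MAX_THROTTLE_US : throttle);
}

int
main(void)
{
	printf("throttle: %llu us/insert\n",
	    (unsigned long long)toy_ckpt_throttle_us(
	    6, 30000000u, 200000u, 64u << 20, 1u << 30));
	return (0);
}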
Example #11
0
/*
 * __wt_block_write_off --
 *	Write a buffer into a block, returning the block's offset, size and
 * checksum.
 */
int
__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump,
    int data_cksum, int caller_locked)
{
	WT_BLOCK_HEADER *blk;
	WT_DECL_RET;
	WT_FH *fh;
	size_t align_size;
	wt_off_t offset;
	int local_locked;

	blk = WT_BLOCK_HEADER_REF(buf->mem);
	fh = block->fh;
	local_locked = 0;

	/* Buffers should be aligned for writing. */
	if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
		WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
		WT_RET_MSG(session, EINVAL,
		    "direct I/O check: write buffer incorrectly allocated");
	}

	/*
	 * Align the size to an allocation unit.
	 *
	 * The buffer must be big enough for us to zero to the next allocsize
	 * boundary, this is one of the reasons the btree layer must find out
	 * from the block-manager layer the maximum size of the eventual write.
	 */
	align_size = WT_ALIGN(buf->size, block->allocsize);
	if (align_size > buf->memsize) {
		WT_ASSERT(session, align_size <= buf->memsize);
		WT_RET_MSG(session, EINVAL,
		    "buffer size check: write buffer incorrectly allocated");
	}
	if (align_size > UINT32_MAX) {
		WT_ASSERT(session, align_size <= UINT32_MAX);
		WT_RET_MSG(session, EINVAL,
		    "buffer size check: write buffer too large to write");
	}

	/* Zero out any unused bytes at the end of the buffer. */
	memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size);

	/*
	 * Set the disk size so we don't have to incrementally read blocks
	 * during salvage.
	 */
	blk->disk_size = WT_STORE_SIZE(align_size);

	/*
	 * Update the block's checksum: if our caller specifies, checksum the
	 * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP
	 * bytes.  The assumption is applications with good compression support
	 * turn off checksums and assume corrupted blocks won't decompress
	 * correctly.  However, if compression failed to shrink the block, the
	 * block wasn't compressed, in which case our caller will tell us to
	 * checksum the data to detect corruption.   If compression succeeded,
	 * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes
	 * because they're not compressed, both to give salvage a quick test
	 * of whether a block is useful and to give us a test so we don't lose
	 * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing.
	 */
	blk->flags = 0;
	if (data_cksum)
		F_SET(blk, WT_BLOCK_DATA_CKSUM);
	blk->cksum = 0;
	blk->cksum = __wt_cksum(
	    buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP);

	if (!caller_locked) {
		WT_RET(__wt_block_ext_prealloc(session, 5));
		__wt_spin_lock(session, &block->live_lock);
		local_locked = 1;
	}
	ret = __wt_block_alloc(session, block, &offset, (wt_off_t)align_size);

	/*
	 * Extend the file in chunks.  We want to limit the number of threads
	 * extending the file at the same time, so choose the one thread that's
	 * crossing the extended boundary.  We don't extend newly created files,
	 * and it's theoretically possible we might wait so long our extension
	 * of the file is passed by another thread writing single blocks, that's
	 * why there's a check in case the extended file size becomes too small:
	 * if the file size catches up, every thread tries to extend it.
	 *
	 * File extension may require locking: some variants of the system call
	 * used to extend the file initialize the extended space. If a writing
	 * thread races with the extending thread, the extending thread might
	 * overwrite already written data, and that would be very, very bad.
	 *
	 * Some variants of the system call to extend the file fail at run-time
	 * based on the filesystem type, fall back to ftruncate in that case,
	 * and remember that ftruncate requires locking.
	 */
	if (ret == 0 &&
	    fh->extend_len != 0 &&
	    (fh->extend_size <= fh->size ||
	    (offset + fh->extend_len <= fh->extend_size &&
	    offset +
	    fh->extend_len + (wt_off_t)align_size >= fh->extend_size))) {
		fh->extend_size = offset + fh->extend_len * 2;
		if (fh->fallocate_available != WT_FALLOCATE_NOT_AVAILABLE) {
			/*
			 * Release any locally acquired lock if it's not needed
			 * to extend the file, extending the file might require
			 * updating file metadata, which can be slow. (It may be
			 * a bad idea to configure for file extension on systems
			 * that require locking over the extend call.)
			 */
			if (!fh->fallocate_requires_locking && local_locked) {
				__wt_spin_unlock(session, &block->live_lock);
				local_locked = 0;
			}

			/* Extend the file. */
			if ((ret = __wt_fallocate(session,
			    fh, offset, fh->extend_len * 2)) == ENOTSUP) {
				ret = 0;
				goto extend_truncate;
			}
		} else {
extend_truncate:	/*
			 * We may have a caller lock or a locally acquired lock,
			 * but we need a lock to call ftruncate.
			 */
			if (!caller_locked && local_locked == 0) {
				__wt_spin_lock(session, &block->live_lock);
				local_locked = 1;
			}
			/*
			 * The truncate might fail if there's a file mapping
			 * (if there's an open checkpoint on the file), that's
			 * OK.
			 */
			if ((ret = __wt_ftruncate(
			    session, fh, offset + fh->extend_len * 2)) == EBUSY)
				ret = 0;
		}
	}
	/* Release any locally acquired lock. */
	if (local_locked) {
		__wt_spin_unlock(session, &block->live_lock);
		local_locked = 0;
	}
	WT_RET(ret);

	/* Write the block. */
	if ((ret =
	    __wt_write(session, fh, offset, align_size, buf->mem)) != 0) {
		if (!caller_locked)
			__wt_spin_lock(session, &block->live_lock);
		WT_TRET(__wt_block_off_free(
		    session, block, offset, (wt_off_t)align_size));
		if (!caller_locked)
			__wt_spin_unlock(session, &block->live_lock);
		WT_RET(ret);
	}

#ifdef HAVE_SYNC_FILE_RANGE
	/*
	 * Optionally schedule writes for dirty pages in the system buffer
	 * cache, but only if the current session can wait.
	 */
	if (block->os_cache_dirty_max != 0 &&
	    (block->os_cache_dirty += align_size) > block->os_cache_dirty_max &&
	    __wt_session_can_wait(session)) {
		block->os_cache_dirty = 0;
		WT_RET(__wt_fsync_async(session, fh));
	}
#endif
#ifdef HAVE_POSIX_FADVISE
	/* Optionally discard blocks from the system buffer cache. */
	if (block->os_cache_max != 0 &&
	    (block->os_cache += align_size) > block->os_cache_max) {
		block->os_cache = 0;
		if ((ret = posix_fadvise(fh->fd,
		    (wt_off_t)0, (wt_off_t)0, POSIX_FADV_DONTNEED)) != 0)
			WT_RET_MSG(
			    session, ret, "%s: posix_fadvise", block->name);
	}
#endif
	WT_STAT_FAST_CONN_INCR(session, block_write);
	WT_STAT_FAST_CONN_INCRV(session, block_byte_write, align_size);

	WT_RET(__wt_verbose(session, WT_VERB_WRITE,
	    "off %" PRIuMAX ", size %" PRIuMAX ", cksum %" PRIu32,
	    (uintmax_t)offset, (uintmax_t)align_size, blk->cksum));

	*offsetp = offset;
	*sizep = WT_STORE_SIZE(align_size);
	*cksump = blk->cksum;

	return (ret);
}
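/*
 * A minimal sketch (not WiredTiger code) of the size-alignment step above:
 * round the payload up to the allocation unit and zero the slack so the block
 * written to disk never contains uninitialized bytes.  TOY_ALLOCSIZE,
 * toy_align() and toy_align_write() are invented for illustration.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	TOY_ALLOCSIZE	512u

/* Round size up to the next multiple of the allocation unit. */
static size_t
toy_align(size_t size)
{
	return ((size + TOY_ALLOCSIZE - 1) / TOY_ALLOCSIZE * TOY_ALLOCSIZE);
}

static size_t
toy_align_write(uint8_t *buf, size_t bufsize, size_t payload)
{
	size_t align_size;

	align_size = toy_align(payload);
	if (align_size > bufsize)
		return (0);		/* buffer too small: caller's bug */

	/* Zero the unused tail so stale memory is never written to disk. */
	memset(buf + payload, 0, align_size - payload);
	return (align_size);
}

int
main(void)
{
	uint8_t buf[2048];
	size_t n;

	memset(buf, 0xab, sizeof(buf));
	n = toy_align_write(buf, sizeof(buf), 700);
	printf("payload 700 -> write %zu bytes, tail byte %u\n",
	    n, (unsigned)buf[1023]);
	return (0);
}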
Example #12
0
/*
 * __lsm_tree_open --
 *	Open an LSM tree structure.
 */
static int
__lsm_tree_open(
    WT_SESSION_IMPL *session, const char *uri, WT_LSM_TREE **treep)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LSM_TREE *lsm_tree;

	conn = S2C(session);
	lsm_tree = NULL;

	WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));

	/* Start the LSM manager thread if it isn't running. */
	if (WT_ATOMIC_CAS4(conn->lsm_manager.lsm_workers, 0, 1))
		WT_RET(__wt_lsm_manager_start(session));

	/* Make sure no one beat us to it. */
	TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q)
		if (strcmp(uri, lsm_tree->name) == 0) {
			*treep = lsm_tree;
			return (0);
		}

	/* Try to open the tree. */
	WT_RET(__wt_calloc_def(session, 1, &lsm_tree));
	WT_ERR(__wt_rwlock_alloc(session, &lsm_tree->rwlock, "lsm tree"));

	WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));

	WT_ERR(__wt_lsm_meta_read(session, lsm_tree));

	/*
	 * Sanity check the configuration. Do it now since this is the first
	 * time we have the LSM tree configuration.
	 */
	WT_ERR(__lsm_tree_open_check(session, lsm_tree));

	if (lsm_tree->nchunks == 0) {
		F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
		WT_ERR(__wt_lsm_tree_switch(session, lsm_tree));
	}

	/* Set the generation number so cursors are opened on first usage. */
	lsm_tree->dsk_gen = 1;

	/*
	 * Set up reference counting. Use separate reference counts for tree
	 * handles and queue entries, so that queue entries don't interfere
	 * with getting handles exclusive.
	 */
	lsm_tree->refcnt = 1;
	lsm_tree->queue_ref = 0;

	/* Set a flush timestamp as a baseline. */
	WT_ERR(__wt_epoch(session, &lsm_tree->last_flush_ts));

	/* Now the tree is set up, make it visible to others. */
	TAILQ_INSERT_HEAD(&S2C(session)->lsmqh, lsm_tree, q);
	F_SET(lsm_tree, WT_LSM_TREE_ACTIVE | WT_LSM_TREE_OPEN);

	*treep = lsm_tree;

	if (0) {
err:		WT_TRET(__lsm_tree_discard(session, lsm_tree));
	}
	return (ret);
}
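/*
 * A self-contained sketch (not WiredTiger code) of the "if (0) { err: ... }"
 * unwind idiom used above: on success we fall past the error label, on any
 * failure we jump to it and discard the partially constructed object before
 * returning.  The toy_thing type and toy_thing_open() are invented.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct toy_thing {
	char *name;
};

static int
toy_thing_open(const char *name, struct toy_thing **thingp)
{
	struct toy_thing *t;
	int ret;

	ret = 0;
	if ((t = calloc(1, sizeof(*t))) == NULL)
		return (ENOMEM);

	if ((t->name = strdup(name)) == NULL) {
		ret = ENOMEM;
		goto err;
	}

	*thingp = t;			/* fully constructed: publish it */

	if (0) {
err:		free(t->name);		/* unwind partial construction */
		free(t);
	}
	return (ret);
}

int
main(void)
{
	struct toy_thing *t = NULL;

	if (toy_thing_open("demo", &t) == 0) {
		printf("opened %s\n", t->name);
		free(t->name);
		free(t);
	}
	return (0);
}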
Example #13
0
/*
 * __wt_lsm_compact --
 *	Compact an LSM tree called via __wt_schema_worker.
 */
int
__wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	time_t begin, end;
	uint64_t progress;
	int i, compacting, flushing, locked, ref;

	compacting = flushing = locked = ref = 0;
	chunk = NULL;
	/*
	 * This function is applied to all matching sources: ignore anything
	 * that is not an LSM tree.
	 */
	if (!WT_PREFIX_MATCH(name, "lsm:"))
		return (0);

	/* Tell __wt_schema_worker not to look inside the LSM tree. */
	*skip = 1;

	WT_RET(__wt_lsm_tree_get(session, name, 0, &lsm_tree));

	if (!F_ISSET(S2C(session), WT_CONN_LSM_MERGE))
		WT_ERR_MSG(session, EINVAL,
		    "LSM compaction requires active merge threads");

	WT_ERR(__wt_seconds(session, &begin));

	/*
	 * Compacting has two distinct phases.
	 * 1.  All in-memory chunks up to and including the current
	 * chunk must be flushed.  Normally, the flush code
	 * does not flush the last, in-use chunk, so we set a force
	 * flag to include that last chunk.  We monitor the state of the
	 * last chunk and periodically push another forced flush work
	 * unit until it is complete.
	 * 2.  After all flushing is done, we move onto the merging
	 * phase for compaction.  Again, we monitor the state and
	 * continue to push merge work units until all merging is done.
	 */

	/* Lock the tree: single-thread compaction. */
	WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
	locked = 1;

	/* Clear any merge throttle: compact throws out that calculation. */
	lsm_tree->merge_throttle = 0;
	lsm_tree->merge_aggressiveness = 0;
	progress = lsm_tree->merge_progressing;

	/* If another thread started a compact on this tree, we're done. */
	if (F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING))
		goto err;

	/*
	 * Set the switch transaction on the current chunk, if it
	 * hasn't been set before.  This prevents further writes, so it
	 * can be flushed by the checkpoint worker.
	 */
	if (lsm_tree->nchunks > 0 &&
	    (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) != NULL) {
		if (chunk->switch_txn == WT_TXN_NONE)
			chunk->switch_txn = __wt_txn_new_id(session);
		/*
		 * If we have a chunk, we want to look for it to be on-disk.
		 * So we need to add a reference to keep it available.
		 */
		(void)WT_ATOMIC_ADD4(chunk->refcnt, 1);
		ref = 1;
	}

	locked = 0;
	WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));

	if (chunk != NULL) {
		WT_ERR(__wt_verbose(session, WT_VERB_LSM,
		    "Compact force flush %s flags 0x%" PRIx32
		    " chunk %u flags 0x%"
		    PRIx32, name, lsm_tree->flags, chunk->id, chunk->flags));
		flushing = 1;
		/*
		 * Make sure the in-memory chunk gets flushed, but don't push a
		 * switch, because we don't want to create a new in-memory
		 * chunk if the tree is being used read-only now.
		 */
		WT_ERR(__wt_lsm_manager_push_entry(session,
		    WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE, lsm_tree));
	} else {
		/*
		 * If there is no chunk to flush, go straight to the
		 * compacting state.
		 */
		compacting = 1;
		progress = lsm_tree->merge_progressing;
		F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
		WT_ERR(__wt_verbose(session, WT_VERB_LSM,
		    "COMPACT: Start compacting %s", lsm_tree->name));
	}

	/* Wait for the work unit queues to drain. */
	while (F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) {
		/*
		 * The flush flag is cleared when the chunk has been flushed.
		 * Continue to push forced flushes until the chunk is on disk.
		 * Once it is on disk move to the compacting phase.
		 */
		if (flushing) {
			WT_ASSERT(session, chunk != NULL);
			if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
				WT_ERR(__wt_verbose(session,
				    WT_VERB_LSM,
				    "Compact flush done %s chunk %u.  "
				    "Start compacting progress %" PRIu64,
				    name, chunk->id,
				    lsm_tree->merge_progressing));
				(void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
				flushing = ref = 0;
				compacting = 1;
				F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
				progress = lsm_tree->merge_progressing;
			} else {
				WT_ERR(__wt_verbose(session, WT_VERB_LSM,
				    "Compact flush retry %s chunk %u",
				    name, chunk->id));
				WT_ERR(__wt_lsm_manager_push_entry(session,
				    WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE,
				    lsm_tree));
			}
		}

		/*
		 * The compacting flag is cleared when no merges can be done.
		 * Ensure that we push through some aggressive merges before
		 * stopping; otherwise we might not do merges that would
		 * span chunks with different generations.
		 */
		if (compacting && !F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) {
			if (lsm_tree->merge_aggressiveness < 10 ||
			    (progress < lsm_tree->merge_progressing) ||
			    lsm_tree->merge_syncing) {
				progress = lsm_tree->merge_progressing;
				F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
				lsm_tree->merge_aggressiveness = 10;
			} else
				break;
		}
		__wt_sleep(1, 0);
		WT_ERR(__wt_seconds(session, &end));
		if (session->compact->max_time > 0 &&
		    session->compact->max_time < (uint64_t)(end - begin)) {
			WT_ERR(ETIMEDOUT);
		}
		/*
		 * Push merge operations while they are still getting work
		 * done. If we are pushing merges, make sure they are
		 * aggressive, to avoid duplicating effort.
		 */
		if (compacting)
#define	COMPACT_PARALLEL_MERGES	5
			for (i = lsm_tree->queue_ref;
			    i < COMPACT_PARALLEL_MERGES; i++) {
				lsm_tree->merge_aggressiveness = 10;
				WT_ERR(__wt_lsm_manager_push_entry(
				    session, WT_LSM_WORK_MERGE, 0, lsm_tree));
			}
	}
err:
	/* Ensure anything we set is cleared. */
	if (ref)
		(void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
	if (compacting) {
		F_CLR(lsm_tree, WT_LSM_TREE_COMPACTING);
		lsm_tree->merge_aggressiveness = 0;
	}
	if (locked)
		WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));

	WT_TRET(__wt_verbose(session, WT_VERB_LSM,
	    "Compact %s complete, return %d", name, ret));

	__wt_lsm_tree_release(session, lsm_tree);
	return (ret);
}
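/*
 * A condensed sketch (not WiredTiger code) of the two-phase wait loop above:
 * keep re-queueing flush work until the last chunk is on disk, then switch to
 * the merge phase and keep pushing merge work until progress stops, giving up
 * if a caller-supplied time budget runs out.  The toy_tree type, toy_phase
 * values and helper names are invented for illustration.
 */
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

enum toy_phase { TOY_FLUSHING, TOY_MERGING, TOY_DONE };

struct toy_tree {
	bool last_chunk_on_disk;
	int merges_left;
};

/* Pretend background workers make a little progress on every poll. */
static void
toy_workers_tick(struct toy_tree *t)
{
	if (!t->last_chunk_on_disk)
		t->last_chunk_on_disk = true;
	else if (t->merges_left > 0)
		t->merges_left--;
}

static int
toy_compact(struct toy_tree *t, time_t max_seconds)
{
	enum toy_phase phase;
	time_t begin;

	begin = time(NULL);
	for (phase = TOY_FLUSHING; phase != TOY_DONE;) {
		toy_workers_tick(t);	/* stand-in for sleeping a second */

		if (phase == TOY_FLUSHING && t->last_chunk_on_disk)
			phase = TOY_MERGING;	/* flush phase complete */
		else if (phase == TOY_MERGING && t->merges_left == 0)
			phase = TOY_DONE;	/* merge phase complete */

		if (max_seconds > 0 && time(NULL) - begin > max_seconds)
			return (-1);		/* time budget exhausted */
	}
	return (0);
}

int
main(void)
{
	struct toy_tree t = { false, 3 };

	printf("compact returned %d\n", toy_compact(&t, 60));
	return (0);
}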
Example #14
0
/*
 * __verify_dsk_row --
 *	Walk a WT_PAGE_ROW_INT or WT_PAGE_ROW_LEAF disk page and verify it.
 */
static int
__verify_dsk_row(
    WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_DECL_ITEM(current);
	WT_DECL_ITEM(last_ovfl);
	WT_DECL_ITEM(last_pfx);
	WT_DECL_RET;
	WT_ITEM *last;
	enum { FIRST, WAS_KEY, WAS_VALUE } last_cell_type;
	void *huffman;
	uint32_t cell_num, cell_type, i, key_cnt, prefix;
	uint8_t *end;
	int cmp;

	btree = S2BT(session);
	bm = btree->bm;
	unpack = &_unpack;
	huffman = dsk->type == WT_PAGE_ROW_INT ? NULL : btree->huffman_key;

	WT_ERR(__wt_scr_alloc(session, 0, &current));
	WT_ERR(__wt_scr_alloc(session, 0, &last_pfx));
	WT_ERR(__wt_scr_alloc(session, 0, &last_ovfl));
	last = last_ovfl;

	end = (uint8_t *)dsk + dsk->mem_size;

	last_cell_type = FIRST;
	cell_num = 0;
	key_cnt = 0;
	WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
		++cell_num;

		/* Carefully unpack the cell. */
		if (__wt_cell_unpack_safe(cell, unpack, end) != 0) {
			ret = __err_cell_corrupted(session, cell_num, addr);
			goto err;
		}

		/* Check the raw and collapsed cell types. */
		WT_ERR(__err_cell_type(
		    session, cell_num, addr, unpack->raw, dsk->type));
		WT_ERR(__err_cell_type(
		    session, cell_num, addr, unpack->type, dsk->type));
		cell_type = unpack->type;

		/*
		 * Check ordering relationships between the WT_CELL entries.
		 * For row-store internal pages, check for:
		 *	two values in a row,
		 *	two keys in a row,
		 *	a value as the first cell on a page.
		 * For row-store leaf pages, check for:
		 *	two values in a row,
		 *	a value as the first cell on a page.
		 */
		switch (cell_type) {
		case WT_CELL_KEY:
		case WT_CELL_KEY_OVFL:
			++key_cnt;
			switch (last_cell_type) {
			case FIRST:
			case WAS_VALUE:
				break;
			case WAS_KEY:
				if (dsk->type == WT_PAGE_ROW_LEAF)
					break;
				WT_ERR_VRFY(session,
				    "cell %" PRIu32 " on page at %s is the "
				    "first of two adjacent keys",
				    cell_num - 1, addr);
			}
			last_cell_type = WAS_KEY;
			break;
		case WT_CELL_ADDR_DEL:
		case WT_CELL_ADDR_INT:
		case WT_CELL_ADDR_LEAF:
		case WT_CELL_ADDR_LEAF_NO:
		case WT_CELL_VALUE:
		case WT_CELL_VALUE_OVFL:
			switch (last_cell_type) {
			case FIRST:
				WT_ERR_VRFY(session,
				    "page at %s begins with a value", addr);
			case WAS_KEY:
				break;
			case WAS_VALUE:
				WT_ERR_VRFY(session,
				    "cell %" PRIu32 " on page at %s is the "
				    "first of two adjacent values",
				    cell_num - 1, addr);
			}
			last_cell_type = WAS_VALUE;
			break;
		}

		/* Check if any referenced item has a valid address. */
		switch (cell_type) {
		case WT_CELL_ADDR_DEL:
		case WT_CELL_ADDR_INT:
		case WT_CELL_ADDR_LEAF:
		case WT_CELL_ADDR_LEAF_NO:
		case WT_CELL_KEY_OVFL:
		case WT_CELL_VALUE_OVFL:
			if (!bm->addr_valid(bm,
			    session, unpack->data, unpack->size))
				goto eof;
			break;
		}

		/*
		 * Remaining checks are for key order and prefix compression.
		 * If this cell isn't a key, we're done, move to the next cell.
		 * If this cell is an overflow item, instantiate the key and
		 * compare it with the last key. Otherwise, we have to deal with
		 * prefix compression.
		 */
		switch (cell_type) {
		case WT_CELL_KEY:
			break;
		case WT_CELL_KEY_OVFL:
			WT_ERR(__wt_dsk_cell_data_ref(
			    session, dsk->type, unpack, current));
			goto key_compare;
		default:
			/* Not a key -- continue with the next cell. */
			continue;
		}

		/*
		 * Prefix compression checks.
		 *
		 * Confirm the first non-overflow key on a page has a zero
		 * prefix compression count.
		 */
		prefix = unpack->prefix;
		if (last_pfx->size == 0 && prefix != 0)
			WT_ERR_VRFY(session,
			    "the %" PRIu32 " key on page at %s is the first "
			    "non-overflow key on the page and has a non-zero "
			    "prefix compression value",
			    cell_num, addr);

		/* Confirm the prefix compression count is possible. */
		if (cell_num > 1 && prefix > last->size)
			WT_ERR_VRFY(session,
			    "key %" PRIu32 " on page at %s has a prefix "
			    "compression count of %" PRIu32 ", larger than "
			    "the length of the previous key, %" WT_SIZET_FMT,
			    cell_num, addr, prefix, last->size);

		/*
		 * If Huffman decoding is required, unpack the cell to build the
		 * key, then resolve the prefix.  Else, we can do it faster
		 * internally because we don't have to shuffle memory around as
		 * much.
		 */
		if (huffman != NULL) {
			WT_ERR(__wt_dsk_cell_data_ref(
			    session, dsk->type, unpack, current));

			/*
			 * If there's a prefix, make sure there's enough buffer
			 * space, then shift the decoded data past the prefix
			 * and copy the prefix into place.  Take care with the
			 * pointers: current->data may be pointing inside the
			 * buffer.
			 */
			if (prefix != 0) {
				WT_ERR(__wt_buf_grow(
				    session, current, prefix + current->size));
				memmove((uint8_t *)current->mem + prefix,
				    current->data, current->size);
				memcpy(current->mem, last->data, prefix);
				current->data = current->mem;
				current->size += prefix;
			}
		} else {
			/*
			 * Get the cell's data/length and make sure we have
			 * enough buffer space.
			 */
			WT_ERR(__wt_buf_init(
			    session, current, prefix + unpack->size));

			/* Copy the prefix then the data into place. */
			if (prefix != 0)
				memcpy(current->mem, last->data, prefix);
			memcpy((uint8_t *)current->mem + prefix, unpack->data,
			    unpack->size);
			current->size = prefix + unpack->size;
		}

key_compare:	/*
		 * Compare the current key against the last key.
		 *
		 * Be careful about the 0th key on internal pages: we only store
		 * the first byte and custom collators may not be able to handle
		 * truncated keys.
		 */
		if ((dsk->type == WT_PAGE_ROW_INT && cell_num > 3) ||
		    (dsk->type != WT_PAGE_ROW_INT && cell_num > 1)) {
			WT_ERR(__wt_compare(
			    session, btree->collator, last, current, &cmp));
			if (cmp >= 0)
				WT_ERR_VRFY(session,
				    "the %" PRIu32 " and %" PRIu32 " keys on "
				    "page at %s are incorrectly sorted",
				    cell_num - 2, cell_num, addr);
		}

		/*
		 * Swap the buffers: last always references the last key entry,
		 * last_pfx and last_ovfl reference the last prefix-compressed
		 * and last overflow key entries.  Current gets pointed to the
		 * buffer we're not using this time around, which is where the
		 * next key goes.
		 */
		last = current;
		if (cell_type == WT_CELL_KEY) {
			current = last_pfx;
			last_pfx = last;
		} else {
			current = last_ovfl;
			last_ovfl = last;
		}
		WT_ASSERT(session, last != current);
	}
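
The checks above exercise prefix compression: each on-page key stores a count of leading bytes shared with the previous key, plus its own suffix. As a minimal standalone sketch (not WiredTiger code; the function name and arguments are hypothetical), rebuilding such a key, assuming the prefix count has already been validated against the previous key's length, looks like this:

#include <assert.h>
#include <string.h>

/*
 * key_rebuild --
 *	Rebuild a prefix-compressed key: copy "prefix" bytes from the previous
 *	key, then append the suffix stored in the cell.
 */
static size_t
key_rebuild(unsigned char *dest, const unsigned char *last_key,
    size_t last_len, size_t prefix,
    const unsigned char *suffix, size_t suffix_len)
{
	assert(prefix <= last_len);	/* Verified by the caller, as above. */
	memcpy(dest, last_key, prefix);
	memcpy(dest + prefix, suffix, suffix_len);
	return (prefix + suffix_len);
}

The verify code does the same copy into a scratch buffer (or a memmove when Huffman decoding has already placed the suffix there), which is why it keeps the previous key around in the last_pfx/last_ovfl pair of buffers.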
Example #15
0
/*
 * __wt_schema_project_in --
 *	Given a list of cursors and a projection, read columns from the
 *	application into the dependent cursors.
 */
int
__wt_schema_project_in(WT_SESSION_IMPL *session,
    WT_CURSOR **cp, const char *proj_arg, va_list ap)
{
	WT_CURSOR *c;
	WT_DECL_ITEM(buf);
	WT_DECL_PACK(pack);
	WT_DECL_PACK_VALUE(pv);
	WT_PACK_VALUE old_pv;
	size_t len, offset, old_len;
	u_long arg;
	char *proj;
	uint8_t *p, *end;
	const uint8_t *next;

	p = end = NULL;		/* -Wuninitialized */

	/* Reset any of the buffers we will be setting. */
	for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
		arg = strtoul(proj, &proj, 10);
		if (*proj == WT_PROJ_KEY) {
			c = cp[arg];
			WT_RET(__wt_buf_init(session, &c->key, 0));
		} else if (*proj == WT_PROJ_VALUE) {
			c = cp[arg];
			WT_RET(__wt_buf_init(session, &c->value, 0));
		}
	}

	for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
		arg = strtoul(proj, &proj, 10);

		switch (*proj) {
		case WT_PROJ_KEY:
			c = cp[arg];
			if (WT_CURSOR_RECNO(c)) {
				c->key.data = &c->recno;
				c->key.size = sizeof(c->recno);
				WT_RET(__pack_init(session, &pack, "R"));
			} else
				WT_RET(__pack_init(
				    session, &pack, c->key_format));
			buf = &c->key;
			p = (uint8_t *)buf->data;
			end = p + buf->size;
			continue;

		case WT_PROJ_VALUE:
			c = cp[arg];
			WT_RET(__pack_init(session, &pack, c->value_format));
			buf = &c->value;
			p = (uint8_t *)buf->data;
			end = p + buf->size;
			continue;
		}

		/* We have to get a key or value before any operations. */
		WT_ASSERT(session, buf != NULL);

		/*
		 * Otherwise, the argument is a count, where a missing
		 * count means a count of 1.
		 */
		for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) {
			switch (*proj) {
			case WT_PROJ_SKIP:
				WT_RET(__pack_next(&pack, &pv));
				/*
				 * A nasty case: if we are inserting
				 * out-of-order, we may reach the end of the
				 * data.  That's okay: we want to append in
				 * that case, and we're positioned to do that.
				 */
				if (p == end) {
					/* Set up an empty value. */
					WT_CLEAR(pv.u);
					if (pv.type == 'S' || pv.type == 's')
						pv.u.s = "";

					WT_RET(__pack_size(session, &pv, &len));
					WT_RET(__wt_buf_grow(session,
					    buf, buf->size + len));
					p = (uint8_t *)buf->mem + buf->size;
					WT_RET(__pack_write(
					    session, &pv, &p, len));
					buf->size += len;
					end = (uint8_t *)buf->mem + buf->size;
				} else if (*proj == WT_PROJ_SKIP)
					WT_RET(__unpack_read(session,
					    &pv, (const uint8_t **)&p,
					    (size_t)(end - p)));
				break;

			case WT_PROJ_NEXT:
				WT_RET(__pack_next(&pack, &pv));
				WT_PACK_GET(session, pv, ap);
				/* FALLTHROUGH */

			case WT_PROJ_REUSE:
				/* Read the item we're about to overwrite. */
				next = p;
				if (p < end) {
					old_pv = pv;
					WT_RET(__unpack_read(session, &old_pv,
					    &next, (size_t)(end - p)));
				}
				old_len = (size_t)(next - p);

				WT_RET(__pack_size(session, &pv, &len));
				offset = WT_PTRDIFF(p, buf->mem);
				WT_RET(__wt_buf_grow(session,
				    buf, buf->size + len));
				p = (uint8_t *)buf->mem + offset;
				end = (uint8_t *)buf->mem + buf->size + len;
				/* Make room if we're inserting out-of-order. */
				if (offset + old_len < buf->size)
					memmove(p + len, p + old_len,
					    buf->size - (offset + old_len));
				WT_RET(__pack_write(session, &pv, &p, len));
				buf->size += len;
				break;

			default:
				WT_RET_MSG(session, EINVAL,
				    "unexpected projection plan: %c",
				    (int)*proj);
			}
		}
	}

	return (0);
}
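
The projection plan consumed above is a compact string: each token is a decimal number followed by a single operation character. For key/value operations the number selects a cursor; otherwise it is a repeat count, and a missing count means 1. A minimal sketch of decoding that token stream, using illustrative operation characters ('k', 'v', 'n', 'r', 's') rather than the real WT_PROJ_* values defined in WiredTiger's headers:

#include <stdio.h>
#include <stdlib.h>

static void
plan_walk(const char *plan)
{
	unsigned long arg;
	char *p;

	for (p = (char *)plan; *p != '\0'; p++) {
		arg = strtoul(p, &p, 10);	/* p is left on the op char. */
		switch (*p) {
		case 'k':		/* Assumed: use cursor arg's key. */
		case 'v':		/* Assumed: use cursor arg's value. */
			printf("cursor %lu, %c\n", arg, *p);
			break;
		case 'n':		/* Assumed: read the next argument. */
		case 'r':		/* Assumed: reuse the previous one. */
		case 's':		/* Assumed: skip a column. */
			if (arg == 0)
				arg = 1;	/* A missing count means 1. */
			printf("%lu x %c\n", arg, *p);
			break;
		default:
			printf("unexpected plan operation: %c\n", *p);
			return;
		}
	}
}

The real function walks the same stream, but instead of printing it re-packs values into the key or value buffer of whichever cursor the last 'k'/'v'-style token selected.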
Example #16
0
/*
 * __wt_txn_recover --
 *	Run recovery.
 */
int
__wt_txn_recover(WT_CONNECTION_IMPL *conn)
{
	WT_CURSOR *metac;
	WT_DECL_RET;
	WT_RECOVERY r;
	WT_SESSION_IMPL *session;
	struct WT_RECOVERY_FILE *metafile;
	char *config;
	int was_backup;

	WT_CLEAR(r);
	INIT_LSN(&r.ckpt_lsn);
	was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP) ? 1 : 0;

	/* We need a real session for recovery. */
	WT_RET(__wt_open_session(conn, NULL, NULL, &session));
	F_SET(session, WT_SESSION_NO_LOGGING);
	r.session = session;

	WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config));
	WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config));
	WT_ERR(__wt_metadata_cursor(session, NULL, &metac));
	metafile = &r.files[WT_METAFILE_ID];
	metafile->c = metac;

	/*
	 * First, do a pass through the log to recover the metadata, and
	 * establish the last checkpoint LSN.  Skip this when opening a hot
	 * backup: we already have the correct metadata in that case.
	 */
	if (!was_backup) {
		r.metadata_only = 1;
		if (IS_INIT_LSN(&metafile->ckpt_lsn))
			WT_ERR(__wt_log_scan(session,
			    NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r));
		else
			WT_ERR(__wt_log_scan(session,
			    &metafile->ckpt_lsn, 0, __txn_log_recover, &r));

		WT_ASSERT(session,
		    LOG_CMP(&r.ckpt_lsn, &conn->log->first_lsn) >= 0);
	}

	/* Scan the metadata to find the live files and their IDs. */
	WT_ERR(__recovery_file_scan(&r));

	/*
	 * We no longer need the metadata cursor: close it to avoid pinning any
	 * resources that could block eviction during recovery.
	 */
	r.files[0].c = NULL;
	WT_ERR(metac->close(metac));

	/*
	 * Now, recover all the files apart from the metadata.
	 * Pass WT_LOGSCAN_RECOVER so that old logs get truncated.
	 */
	r.metadata_only = 0;
	WT_ERR(__wt_verbose(session, WT_VERB_RECOVERY,
	    "Main recovery loop: starting at %u/%" PRIuMAX,
	    r.ckpt_lsn.file, (uintmax_t)r.ckpt_lsn.offset));
	if (IS_INIT_LSN(&r.ckpt_lsn))
		WT_ERR(__wt_log_scan(session, NULL,
		    WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER,
		    __txn_log_recover, &r));
	else
		WT_ERR(__wt_log_scan(session, &r.ckpt_lsn,
		    WT_LOGSCAN_RECOVER,
		    __txn_log_recover, &r));

	conn->next_file_id = r.max_fileid;

	/*
	 * If recovery ran successfully, forcibly log a checkpoint so the next
	 * open is fast, and keep the metadata up to date with the checkpoint
	 * LSN and archiving.
	 */
	WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));

err:	WT_TRET(__recovery_free(&r));
	__wt_free(session, config);
	WT_TRET(session->iface.close(&session->iface, NULL));

	return (ret);
}
Example #17
0
/*
 * __wt_schema_project_slice --
 *	Given a list of cursors and a projection, read columns from
 *	a raw buffer.
 */
int
__wt_schema_project_slice(WT_SESSION_IMPL *session, WT_CURSOR **cp,
    const char *proj_arg, bool key_only, const char *vformat, WT_ITEM *value)
{
	WT_CURSOR *c;
	WT_DECL_ITEM(buf);
	WT_DECL_PACK(pack);
	WT_DECL_PACK_VALUE(pv);
	WT_DECL_PACK_VALUE(vpv);
	WT_PACK vpack;
	u_long arg;
	char *proj;
	uint8_t *end, *p;
	const uint8_t *next, *vp, *vend;
	size_t len, offset, old_len;
	bool skip;

	p = end = NULL;		/* -Wuninitialized */

	WT_RET(__pack_init(session, &vpack, vformat));
	vp = value->data;
	vend = vp + value->size;

	/* Reset any of the buffers we will be setting. */
	for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
		arg = strtoul(proj, &proj, 10);
		if (*proj == WT_PROJ_KEY) {
			c = cp[arg];
			WT_RET(__wt_buf_init(session, &c->key, 0));
		} else if (*proj == WT_PROJ_VALUE && !key_only) {
			c = cp[arg];
			WT_RET(__wt_buf_init(session, &c->value, 0));
		}
	}

	skip = key_only;
	for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
		arg = strtoul(proj, &proj, 10);

		switch (*proj) {
		case WT_PROJ_KEY:
			skip = false;
			c = cp[arg];
			if (WT_CURSOR_RECNO(c)) {
				c->key.data = &c->recno;
				c->key.size = sizeof(c->recno);
				WT_RET(__pack_init(session, &pack, "R"));
			} else
				WT_RET(__pack_init(
				    session, &pack, c->key_format));
			buf = &c->key;
			p = (uint8_t *)buf->data;
			end = p + buf->size;
			continue;

		case WT_PROJ_VALUE:
			skip = key_only;
			if (skip)
				continue;
			c = cp[arg];
			WT_RET(__pack_init(session, &pack, c->value_format));
			buf = &c->value;
			p = (uint8_t *)buf->data;
			end = p + buf->size;
			continue;
		}

		/* We have to get a key or value before any operations. */
		WT_ASSERT(session, skip || buf != NULL);

		/*
		 * Otherwise, the argument is a count, where a missing
		 * count means a count of 1.
		 */
		for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) {
			switch (*proj) {
			case WT_PROJ_SKIP:
				if (skip)
					break;
				WT_RET(__pack_next(&pack, &pv));

				/*
				 * A nasty case: if we are inserting
				 * out-of-order, append a zero value to keep
				 * the buffer in the correct format.
				 */
				if (p == end) {
					/* Set up an empty value. */
					WT_CLEAR(pv.u);
					if (pv.type == 'S' || pv.type == 's')
						pv.u.s = "";

					WT_RET(__pack_size(session, &pv, &len));
					WT_RET(__wt_buf_grow(session,
					    buf, buf->size + len));
					p = (uint8_t *)buf->data + buf->size;
					WT_RET(__pack_write(
					    session, &pv, &p, len));
					end = p;
					buf->size += len;
				} else
					WT_RET(__unpack_read(session,
					    &pv, (const uint8_t **)&p,
					    (size_t)(end - p)));
				break;

			case WT_PROJ_NEXT:
				WT_RET(__pack_next(&vpack, &vpv));
				WT_RET(__unpack_read(session, &vpv,
				    &vp, (size_t)(vend - vp)));
				/* FALLTHROUGH */

			case WT_PROJ_REUSE:
				if (skip)
					break;

				/*
				 * Read the item we're about to overwrite.
				 *
				 * There is subtlety here: the value format
				 * may not exactly match the cursor's format.
				 * In particular, we need lengths with raw
				 * columns in the middle of a packed struct,
				 * but not if they are at the end of a struct.
				 */
				WT_RET(__pack_next(&pack, &pv));

				next = p;
				if (p < end)
					WT_RET(__unpack_read(session, &pv,
					    &next, (size_t)(end - p)));
				old_len = (size_t)(next - p);

				/* Make sure the types are compatible. */
				WT_ASSERT(session,
				    __wt_tolower((u_char)pv.type) ==
				    __wt_tolower((u_char)vpv.type));
				pv.u = vpv.u;

				WT_RET(__pack_size(session, &pv, &len));
				offset = WT_PTRDIFF(p, buf->data);
				/*
				 * Avoid growing the buffer if the value fits.
				 * This is not just a performance issue: it
				 * covers the case of record number keys, which
				 * have to be written to cursor->recno.
				 */
				if (len > old_len)
					WT_RET(__wt_buf_grow(session,
					    buf, buf->size + len - old_len));
				p = (uint8_t *)buf->data + offset;
				/* Make room if we're inserting out-of-order. */
				if (offset + old_len < buf->size)
					memmove(p + len, p + old_len,
					    buf->size - (offset + old_len));
				WT_RET(__pack_write(session, &pv, &p, len));
				buf->size += len - old_len;
				end = (uint8_t *)buf->data + buf->size;
				break;
			default:
				WT_RET_MSG(session, EINVAL,
				    "unexpected projection plan: %c",
				    (int)*proj);
			}
		}
	}

	return (0);
}
Example #18
0
/*
 * __wt_struct_plan --
 *	Given a table cursor containing a complete table, build the "projection
 *	plan" to distribute the columns to dependent stores.  A string
 *	representing the plan will be appended to the plan buffer.
 */
int
__wt_struct_plan(WT_SESSION_IMPL *session, WT_TABLE *table,
    const char *columns, size_t len, int value_only, WT_ITEM *plan)
{
	WT_BTREE *saved_btree;
	WT_CONFIG conf;
	WT_CONFIG_ITEM k, v;
	WT_DECL_RET;
	int cg, col, current_cg, current_col, start_cg, start_col;
	int i, have_it;
	char coltype, current_coltype;

	saved_btree = session->btree;
	start_cg = start_col = -1;      /* -Wuninitialized */

	/* Work through the value columns by skipping over the key columns. */
	WT_ERR(__wt_config_initn(session, &conf, columns, len));

	if (value_only)
		for (i = 0; i < table->nkey_columns; i++)
			WT_ERR(__wt_config_next(&conf, &k, &v));

	current_cg = cg = 0;
	current_col = col = INT_MAX;
	current_coltype = coltype = WT_PROJ_KEY; /* Keep lint quiet. */
	while (__wt_config_next(&conf, &k, &v) == 0) {
		have_it = 0;

		while (__find_next_col(session, table,
		    &k, &cg, &col, &coltype) == 0 &&
		    (!have_it || cg != start_cg || col != start_col)) {
			/*
			 * First we move to the column.  If that is in a
			 * different column group to the last column we
			 * accessed, or before the last column in the same
			 * column group, or moving from the key to the value,
			 * we need to switch column groups or rewind.
			 */
			if (current_cg != cg || current_col > col ||
			    current_coltype != coltype) {
				WT_ASSERT(session, !value_only ||
				    coltype == WT_PROJ_VALUE);
				WT_ERR(__wt_buf_catfmt(
				    session, plan, "%d%c", cg, coltype));

				/*
				 * Set the current column group and column
				 * within the table.
				 */
				current_cg = cg;
				current_col = 0;
				current_coltype = coltype;
			}
			/* Now move to the column we want. */
			if (current_col < col) {
				if (col - current_col > 1)
					WT_ERR(__wt_buf_catfmt(session,
					    plan, "%d", col - current_col));
				WT_ERR(__wt_buf_catfmt(session,
				    plan, "%c", WT_PROJ_SKIP));
			}
			/*
			 * Now copy the value in / out.  In the common case,
			 * where each value is used in one column, we do a
			 * "next" operation.  If the value is used again, we do
			 * a "reuse" operation to avoid making another copy.
			 */
			if (!have_it) {
				WT_ERR(__wt_buf_catfmt(session,
				    plan, "%c", WT_PROJ_NEXT));

				start_cg = cg;
				start_col = col;
				have_it = 1;
			} else
				WT_ERR(__wt_buf_catfmt(session,
				    plan, "%c", WT_PROJ_REUSE));
			current_col = col + 1;
		}
	}

err:	session->btree = saved_btree;
	return (ret);
}
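
As the catfmt calls above show, the plan itself is plain text: the builder appends "%d%c" tokens (a column group plus a key/value switch, or a skip count plus the skip operation) and bare operation characters for next/reuse. A minimal sketch of such append helpers, with simplified buffer handling and purely illustrative operation characters (the real values are WiredTiger's WT_PROJ_* macros):

#include <stdio.h>
#include <string.h>

static int
plan_cat(char *plan, size_t plan_size, int count, char op)
{
	size_t len;
	int n;

	len = strlen(plan);
	n = snprintf(plan + len, plan_size - len, "%d%c", count, op);
	return (n < 0 || (size_t)n >= plan_size - len ? -1 : 0);
}

static int
plan_catc(char *plan, size_t plan_size, char op)
{
	size_t len;

	len = strlen(plan);
	if (len + 2 > plan_size)
		return (-1);
	plan[len] = op;
	plan[len + 1] = '\0';
	return (0);
}

For example, appending (0, 'k') and then two bare 'n' characters would build a plan that switches to column group 0's key and emits the next two columns, mirroring how the loop above alternates group-switch, skip, and next/reuse tokens.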
Example #19
0
/*
 * __wt_col_modify --
 *	Column-store delete, insert, and update.
 */
int
__wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
    uint64_t recno, WT_ITEM *value, WT_UPDATE *upd, int is_remove)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_INSERT *ins;
	WT_INSERT_HEAD *ins_head, **ins_headp;
	WT_ITEM _value;
	WT_PAGE *page;
	WT_UPDATE *old_upd;
	size_t ins_size, upd_size;
	u_int i, skipdepth;
	int append, logged;

	btree = cbt->btree;
	ins = NULL;
	page = cbt->ref->page;
	append = logged = 0;

	/* This code expects a remove to have a NULL value. */
	if (is_remove) {
		if (btree->type == BTREE_COL_FIX) {
			value = &_value;
			value->data = "";
			value->size = 1;
		} else
			value = NULL;
	} else {
		/*
		 * There's some chance the application specified a record past
		 * the last record on the page.  If that's the case, and we're
		 * inserting a new WT_INSERT/WT_UPDATE pair, it goes on the
		 * append list, not the update list. In addition, a recno of 0
		 * implies an append operation: we're allocating a new row.
		 */
		if (recno == 0 ||
		    recno > (btree->type == BTREE_COL_VAR ?
		    __col_var_last_recno(page) : __col_fix_last_recno(page)))
			append = 1;
	}

	/* If we don't yet have a modify structure, we'll need one. */
	WT_RET(__wt_page_modify_init(session, page));

	/*
	 * Delete, insert or update a column-store entry.
	 *
	 * If modifying a previously modified record, create a new WT_UPDATE
	 * entry and have a serialized function link it into an existing
	 * WT_INSERT entry's WT_UPDATE list.
	 *
	 * Else, allocate an insert array as necessary, build a WT_INSERT and
	 * WT_UPDATE structure pair, and call a serialized function to insert
	 * the WT_INSERT structure.
	 */
	if (cbt->compare == 0 && cbt->ins != NULL) {
		/*
		 * If we are restoring updates that couldn't be evicted, the
		 * key must not exist on the new page.
		 */
		WT_ASSERT(session, upd == NULL);

		/* Make sure the update can proceed. */
		WT_ERR(__wt_txn_update_check(
		    session, old_upd = cbt->ins->upd));

		/* Allocate a WT_UPDATE structure and transaction ID. */
		WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
		WT_ERR(__wt_txn_modify(session, upd));
		logged = 1;

		/* Avoid a data copy in WT_CURSOR.update. */
		cbt->modify_update = upd;

		/*
		 * Point the new WT_UPDATE item to the next element in the list.
		 * If we get it right, the serialization function lock acts as
		 * our memory barrier to flush this write.
		 */
		upd->next = old_upd;

		/* Serialize the update. */
		WT_ERR(__wt_update_serial(
		    session, page, &cbt->ins->upd, &upd, upd_size));
	} else {
		/* Allocate the append/update list reference as necessary. */
		if (append) {
			WT_PAGE_ALLOC_AND_SWAP(session,
			    page, page->modify->mod_append, ins_headp, 1);
			ins_headp = &page->modify->mod_append[0];
		} else if (page->type == WT_PAGE_COL_FIX) {
			WT_PAGE_ALLOC_AND_SWAP(session,
			    page, page->modify->mod_update, ins_headp, 1);
			ins_headp = &page->modify->mod_update[0];
		} else {
			WT_PAGE_ALLOC_AND_SWAP(session,
			    page, page->modify->mod_update, ins_headp,
			    page->pg_var_entries);
			ins_headp = &page->modify->mod_update[cbt->slot];
		}

		/* Allocate the WT_INSERT_HEAD structure as necessary. */
		WT_PAGE_ALLOC_AND_SWAP(session, page, *ins_headp, ins_head, 1);
		ins_head = *ins_headp;

		/* Choose a skiplist depth for this insert. */
		skipdepth = __wt_skip_choose_depth(session);

		/*
		 * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and
		 * update the cursor to reference it (the WT_INSERT_HEAD might
		 * be allocated, the WT_INSERT was allocated).
		 */
		WT_ERR(__col_insert_alloc(
		    session, recno, skipdepth, &ins, &ins_size));
		cbt->ins_head = ins_head;
		cbt->ins = ins;

		if (upd == NULL) {
			WT_ERR(
			    __wt_update_alloc(session, value, &upd, &upd_size));
			WT_ERR(__wt_txn_modify(session, upd));
			logged = 1;

			/* Avoid a data copy in WT_CURSOR.update. */
			cbt->modify_update = upd;
		} else
			upd_size = __wt_update_list_memsize(upd);
		ins->upd = upd;
		ins_size += upd_size;

		/*
		 * If there was no insert list during the search, or there was
		 * no search because the record number has not been allocated
		 * yet, the cursor's information cannot be correct; search
		 * couldn't have initialized it.
		 *
		 * Otherwise, point the new WT_INSERT item's skiplist to the
		 * next elements in the insert list (which we will check are
		 * still valid inside the serialization function).
		 *
		 * The serial mutex acts as our memory barrier to flush these
		 * writes before inserting them into the list.
		 */
		if (WT_SKIP_FIRST(ins_head) == NULL || recno == 0)
			for (i = 0; i < skipdepth; i++) {
				cbt->ins_stack[i] = &ins_head->head[i];
				ins->next[i] = cbt->next_stack[i] = NULL;
			}
		else
			for (i = 0; i < skipdepth; i++)
				ins->next[i] = cbt->next_stack[i];

		/* Append or insert the WT_INSERT structure. */
		if (append)
			WT_ERR(__wt_col_append_serial(
			    session, page, cbt->ins_head, cbt->ins_stack,
			    &ins, ins_size, &cbt->recno, skipdepth));
		else
			WT_ERR(__wt_insert_serial(
			    session, page, cbt->ins_head, cbt->ins_stack,
			    &ins, ins_size, skipdepth));
	}

	/* If the update was successful, add it to the in-memory log. */
	if (logged)
		WT_ERR(__wt_txn_log_op(session, cbt));

	if (0) {
err:		/*
		 * Remove the update from the current transaction, so we don't
		 * try to modify it on rollback.
		 */
		if (logged)
			__wt_txn_unmodify(session);
		__wt_free(session, ins);
		__wt_free(session, upd);
	}

	return (ret);
}
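
The skiplist insert above first picks a random depth for the new element. A minimal sketch of that choice flips a biased coin until it fails or the maximum depth is reached; the 1-in-4 probability, the maximum, and the use of rand() are illustrative assumptions, not WiredTiger's __wt_skip_choose_depth:

#include <stdlib.h>

#define SKIP_MAXDEPTH	10		/* Assumed maximum depth. */

static unsigned
skip_choose_depth(void)
{
	unsigned depth;

	for (depth = 1; depth < SKIP_MAXDEPTH && (rand() & 3) == 0; ++depth)
		;
	return (depth);
}

A geometric depth distribution keeps the expected number of pointers per element constant while still giving expected O(log n) search levels, which is why the insert stack above only needs to record one position per level.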
Example #20
0
File: bt_page.c Project: 3rf/mongo
/*
 * __wt_page_in_func --
 *	Acquire a hazard pointer to a page; if the page is not in-memory,
 *	read it from the disk and build an in-memory version.
 */
int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
	WT_DECL_RET;
	WT_PAGE *page;
	int busy, force_attempts, oldgen;

	for (force_attempts = oldgen = 0;;) {
		switch (ref->state) {
		case WT_REF_DISK:
		case WT_REF_DELETED:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);

			/*
			 * The page isn't in memory, attempt to read it.
			 * Make sure there is space in the cache.
			 */
			WT_RET(__wt_cache_full_check(session));
			WT_RET(__wt_cache_read(session, ref));
			oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
			    F_ISSET(session, WT_SESSION_NO_CACHE);
			continue;
		case WT_REF_READING:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);
			/* FALLTHROUGH */
		case WT_REF_LOCKED:
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);
			/* The page is busy -- wait. */
			break;
		case WT_REF_SPLIT:
			return (WT_RESTART);
		case WT_REF_MEM:
			/*
			 * The page is in memory: get a hazard pointer, update
			 * the page's LRU and return.  The expected reason we
			 * can't get a hazard pointer is that the page is
			 * being evicted; yield and try again.
			 */
#ifdef HAVE_DIAGNOSTIC
			WT_RET(
			    __wt_hazard_set(session, ref, &busy, file, line));
#else
			WT_RET(__wt_hazard_set(session, ref, &busy));
#endif
			if (busy)
				break;

			page = ref->page;
			WT_ASSERT(session, page != NULL);

			/* Forcibly evict pages that are too big. */
			if (!LF_ISSET(WT_READ_NO_EVICT) &&
			    force_attempts < 10 &&
			    __evict_force_check(session, page)) {
				++force_attempts;
				WT_RET(__wt_page_release(session, ref, flags));
				break;
			}

			/* Check if we need an autocommit transaction. */
			if ((ret = __wt_txn_autocommit_check(session)) != 0) {
				WT_TRET(__wt_hazard_clear(session, page));
				return (ret);
			}

			/*
			 * If we read the page and we are configured to not
			 * trash the cache, set the oldest read generation so
			 * the page is forcibly evicted as soon as possible.
			 *
			 * Otherwise, update the page's read generation.
			 */
			if (oldgen && page->read_gen == WT_READGEN_NOTSET)
				__wt_page_evict_soon(page);
			else if (!LF_ISSET(WT_READ_NO_GEN) &&
			    page->read_gen < __wt_cache_read_gen(session))
				page->read_gen =
				    __wt_cache_read_gen_set(session);

			return (0);
		WT_ILLEGAL_VALUE(session);
		}

		/* We failed to get the page -- yield before retrying. */
		__wt_yield();
	}
}
Example #21
0
/*
 * __curjoin_iter_set_entry --
 *	Set the current entry for an iterator.
 */
static int
__curjoin_iter_set_entry(WT_CURSOR_JOIN_ITER *iter, u_int entry_pos)
{
	WT_CURSOR *c, *to_dup;
	WT_CURSOR_JOIN *cjoin, *topjoin;
	WT_CURSOR_JOIN_ENTRY *entry;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	size_t size;
	const char *raw_cfg[] = { WT_CONFIG_BASE(
	    iter->session, WT_SESSION_open_cursor), "raw", NULL };
	const char *def_cfg[] = { WT_CONFIG_BASE(
	    iter->session, WT_SESSION_open_cursor), NULL };
	const char **config;
	char *uri;

	session = iter->session;
	cjoin = iter->cjoin;
	uri = NULL;
	entry = iter->entry = &cjoin->entries[entry_pos];
	iter->positioned = false;
	iter->entry_pos = entry_pos;
	iter->end_pos = 0;

	iter->is_equal = (entry->ends_next == 1 &&
	    WT_CURJOIN_END_RANGE(&entry->ends[0]) == WT_CURJOIN_END_EQ);
	iter->end_skip = (entry->ends_next > 0 &&
	    WT_CURJOIN_END_RANGE(&entry->ends[0]) == WT_CURJOIN_END_GE) ? 1 : 0;

	iter->end_count = WT_MIN(1, entry->ends_next);
	if (F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION)) {
		iter->entry_count = cjoin->entries_next;
		if (iter->is_equal)
			iter->end_count = entry->ends_next;
	} else
		iter->entry_count = 1;
	WT_ASSERT(iter->session, iter->entry_pos < iter->entry_count);

	entry->stats.iterated = 0;

	if (entry->subjoin == NULL) {
		for (topjoin = iter->cjoin; topjoin->parent != NULL;
		     topjoin = topjoin->parent)
			;
		to_dup = entry->ends[0].cursor;

		if (F_ISSET((WT_CURSOR *)topjoin, WT_CURSTD_RAW))
			config = &raw_cfg[0];
		else
			config = &def_cfg[0];

		size = strlen(to_dup->internal_uri) + 3;
		WT_ERR(__wt_calloc(session, size, 1, &uri));
		WT_ERR(__wt_snprintf(uri, size, "%s()", to_dup->internal_uri));
		if ((c = iter->cursor) == NULL || strcmp(c->uri, uri) != 0) {
			iter->cursor = NULL;
			if (c != NULL)
				WT_ERR(c->close(c));
			WT_ERR(__wt_open_cursor(session, uri,
			    (WT_CURSOR *)topjoin, config, &iter->cursor));
		}
		WT_ERR(__wt_cursor_dup_position(to_dup, iter->cursor));
	} else if (iter->cursor != NULL) {
		WT_ERR(iter->cursor->close(iter->cursor));
		iter->cursor = NULL;
	}

err:	__wt_free(session, uri);
	return (ret);
}
Example #22
0
/*
 * __wt_btcur_prev --
 *	Move to the previous record in the tree.
 */
int
__wt_btcur_prev(WT_CURSOR_BTREE *cbt, int discard)
{
	WT_DECL_RET;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	uint32_t flags;
	int newpage;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	WT_DSTAT_INCR(session, cursor_prev);

	flags = WT_TREE_SKIP_INTL | WT_TREE_PREV;	/* Tree walk flags. */
	if (discard)
		LF_SET(WT_TREE_DISCARD);

retry:	WT_RET(__cursor_func_init(cbt, 0));
	__cursor_position_clear(cbt);

	/*
	 * If we aren't already iterating in the right direction, there's
	 * some setup to do.
	 */
	if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV))
		__wt_btcur_iterate_setup(cbt, 0);

	/*
	 * If this is a modification, we're about to read information from the
	 * page, save the write generation.
	 */
	page = cbt->page;
	if (discard && page != NULL) {
		WT_ERR(__wt_page_modify_init(session, page));
		WT_ORDERED_READ(cbt->write_gen, page->modify->write_gen);
	}

	/*
	 * Walk any page we're holding until the underlying call returns not-
	 * found.  Then, move to the previous page, until we reach the start
	 * of the file.
	 */
	for (newpage = 0;; newpage = 1) {
		if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
			switch (page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_append_prev(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_append_prev(cbt, newpage);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			if (ret == 0)
				break;
			F_CLR(cbt, WT_CBT_ITERATE_APPEND);
			if (ret != WT_NOTFOUND)
				break;
			newpage = 1;
		}
		if (page != NULL) {
			switch (page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_prev(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_prev(cbt, newpage);
				break;
			case WT_PAGE_ROW_LEAF:
				ret = __cursor_row_prev(cbt, newpage);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			if (ret != WT_NOTFOUND)
				break;
		}

		cbt->page = NULL;
		WT_ERR(__wt_tree_walk(session, &page, flags));
		WT_ERR_TEST(page == NULL, WT_NOTFOUND);
		WT_ASSERT(session,
		    page->type != WT_PAGE_COL_INT &&
		    page->type != WT_PAGE_ROW_INT);
		cbt->page = page;

		/* Initialize the page's modification information */
		if (discard) {
			WT_ERR(__wt_page_modify_init(session, page));
			WT_ORDERED_READ(
			    cbt->write_gen, page->modify->write_gen);
		}

		/*
		 * The last page in a column-store has appended entries.
		 * We handle it separately from the usual cursor code:
		 * it's only that one page and it's in a simple format.
		 */
		if (page->type != WT_PAGE_ROW_LEAF &&
		    (cbt->ins_head = WT_COL_APPEND(page)) != NULL)
			F_SET(cbt, WT_CBT_ITERATE_APPEND);
	}

err:	if (ret == WT_RESTART)
		goto retry;
	WT_TRET(__cursor_func_resolve(cbt, ret));
	return (ret);
}
Example #23
0
/*
 * __curjoin_entry_member --
 *	Do a membership check for a particular index that was joined;
 *	if the key is not a member, return WT_NOTFOUND.
 */
static int
__curjoin_entry_member(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry,
    WT_ITEM *key, WT_CURSOR_JOIN_ITER *iter)
{
	WT_CURJOIN_EXTRACTOR extract_cursor;
	WT_CURSOR *c;
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,		/* get-key */
	    __wt_cursor_get_value,		/* get-value */
	    __wt_cursor_set_key,		/* set-key */
	    __wt_cursor_set_value,		/* set-value */
	    __wt_cursor_compare_notsup,		/* compare */
	    __wt_cursor_equals_notsup,		/* equals */
	    __wt_cursor_notsup,			/* next */
	    __wt_cursor_notsup,			/* prev */
	    __wt_cursor_notsup,			/* reset */
	    __wt_cursor_notsup,			/* search */
	    __wt_cursor_search_near_notsup,	/* search-near */
	    __curjoin_extract_insert,		/* insert */
	    __wt_cursor_modify_notsup,		/* modify */
	    __wt_cursor_notsup,			/* update */
	    __wt_cursor_notsup,			/* remove */
	    __wt_cursor_notsup,			/* reserve */
	    __wt_cursor_reconfigure_notsup,	/* reconfigure */
	    __wt_cursor_notsup,			/* cache */
	    __wt_cursor_reopen_notsup,		/* reopen */
	    __wt_cursor_notsup);		/* close */
	WT_DECL_RET;
	WT_INDEX *idx;
	WT_ITEM v;
	bool bloom_found;

	if (entry->subjoin == NULL && iter != NULL &&
	    (iter->end_pos + iter->end_skip >= entry->ends_next ||
	    (iter->end_skip > 0 &&
	    F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION))))
		return (0);	/* no checks to make */

	entry->stats.membership_check++;
	bloom_found = false;

	if (entry->bloom != NULL) {
		/*
		 * If the item is not in the Bloom filter, we return
		 * immediately; otherwise, we may still need to check the
		 * long way, since it may be a false positive.
		 *
		 * If we don't own the Bloom filter, we must be sharing one
		 * set up by a previous entry, so the shared filter has
		 * already been checked and passed; we don't need to check
		 * it again.  We'll still need to check the long way.
		 */
		if (F_ISSET(entry, WT_CURJOIN_ENTRY_OWN_BLOOM))
			WT_ERR(__wt_bloom_inmem_get(entry->bloom, key));
		if (F_ISSET(entry, WT_CURJOIN_ENTRY_FALSE_POSITIVES))
			return (0);
		bloom_found = true;
	}
	if (entry->subjoin != NULL) {
		WT_ASSERT(session,
		    iter == NULL || entry->subjoin == iter->child->cjoin);
		ret = __curjoin_entries_in_range(session, entry->subjoin,
		    key, iter == NULL ? NULL : iter->child);
		if (iter != NULL &&
		    WT_CURJOIN_ITER_CONSUMED(iter->child)) {
			WT_ERR(__curjoin_iter_bump(iter));
			ret = WT_NOTFOUND;
		}
		return (ret);
	}
	if (entry->index != NULL) {
		/*
		 * If this entry is used by the iterator, then we already
		 * have the index key, and we won't have to do any
		 * extraction either.
		 */
		if (iter != NULL && entry == iter->entry)
			WT_ITEM_SET(v, iter->idxkey);
		else {
			memset(&v, 0, sizeof(v));	/* Keep lint quiet. */
			c = entry->main;
			c->set_key(c, key);
			entry->stats.main_access++;
			if ((ret = c->search(c)) == 0)
				ret = c->get_value(c, &v);
			else if (ret == WT_NOTFOUND) {
				__wt_err(session, ret,
				    "main table for join is missing entry");
				ret = WT_ERROR;
			}
			WT_TRET(c->reset(c));
			WT_ERR(ret);
		}
	} else
		WT_ITEM_SET(v, *key);

	if ((idx = entry->index) != NULL && idx->extractor != NULL &&
	    (iter == NULL || entry != iter->entry)) {
		WT_CLEAR(extract_cursor);
		extract_cursor.iface = iface;
		extract_cursor.iface.session = &session->iface;
		extract_cursor.iface.key_format = idx->exkey_format;
		extract_cursor.ismember = false;
		extract_cursor.entry = entry;
		WT_ERR(idx->extractor->extract(idx->extractor,
		    &session->iface, key, &v, &extract_cursor.iface));
		__wt_buf_free(session, &extract_cursor.iface.key);
		__wt_buf_free(session, &extract_cursor.iface.value);
		if (!extract_cursor.ismember)
			WT_ERR(WT_NOTFOUND);
	} else
		WT_ERR(__curjoin_entry_in_range(session, entry, &v, iter));

	if (0) {
err:		if (ret == WT_NOTFOUND && bloom_found)
			entry->stats.bloom_false_positive++;
	}
	return (ret);
}
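
The Bloom filter logic above follows the usual pre-check pattern: a negative answer from the filter is definitive, while a positive answer may be a false positive and still requires the full lookup. A minimal sketch of such a membership test, with an assumed filter size, hash count, and FNV-1a-based double hashing (WiredTiger's __wt_bloom_* code uses its own parameters and hashing):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define BLOOM_BITS	(1u << 16)	/* Assumed filter size in bits. */
#define BLOOM_HASHES	4u		/* Assumed number of hash functions. */

static uint64_t
fnv1a(const void *key, size_t len, uint64_t seed)
{
	const uint8_t *p = key;
	uint64_t h = seed ^ 14695981039346656037ULL;
	size_t i;

	for (i = 0; i < len; i++)
		h = (h ^ p[i]) * 1099511628211ULL;
	return (h);
}

/* The bits array is BLOOM_BITS / 8 bytes, populated when items were added. */
static bool
bloom_maybe_contains(const uint8_t *bits, const void *key, size_t len)
{
	uint64_t h1 = fnv1a(key, len, 0), h2 = fnv1a(key, len, 1);
	uint32_t bit, i;

	for (i = 0; i < BLOOM_HASHES; i++) {
		bit = (uint32_t)((h1 + i * h2) % BLOOM_BITS);
		if (!(bits[bit >> 3] & (1u << (bit & 7))))
			return (false);		/* Definitely not a member. */
	}
	return (true);				/* Possibly a member. */
}

A caller falls through to the definitive check only when this returns true, which is exactly why the err label above counts a WT_NOTFOUND after a filter hit as a Bloom false positive.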
Example #24
0
/*
 * __wt_txn_get_snapshot --
 *	Allocate a snapshot.
 */
void
__wt_txn_get_snapshot(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_STATE *s, *txn_state;
	uint64_t current_id, id;
	uint64_t prev_oldest_id, snap_min;
	uint32_t i, n, session_cnt;
	int32_t count;

	conn = S2C(session);
	txn = &session->txn;
	txn_global = &conn->txn_global;
	txn_state = WT_SESSION_TXN_STATE(session);

	/*
	 * We're going to scan.  Increment the count of scanners to prevent the
	 * oldest ID from moving forwards.  Spin if the count is negative,
	 * which indicates that some thread is moving the oldest ID forwards.
	 */
	do {
		if ((count = txn_global->scan_count) < 0)
			WT_PAUSE();
	} while (count < 0 ||
	    !__wt_atomic_casiv32(&txn_global->scan_count, count, count + 1));

	current_id = snap_min = txn_global->current;
	prev_oldest_id = txn_global->oldest_id;

	/* For pure read-only workloads, avoid scanning. */
	if (prev_oldest_id == current_id) {
		txn_state->snap_min = current_id;
		__txn_sort_snapshot(session, 0, current_id);

		/* Check that the oldest ID has not moved in the meantime. */
		if (prev_oldest_id == txn_global->oldest_id) {
			WT_ASSERT(session, txn_global->scan_count > 0);
			(void)__wt_atomic_subiv32(&txn_global->scan_count, 1);
			return;
		}
	}

	/* Walk the array of concurrent transactions. */
	WT_ORDERED_READ(session_cnt, conn->session_cnt);
	for (i = n = 0, s = txn_global->states; i < session_cnt; i++, s++) {
		/*
		 * Build our snapshot of any concurrent transaction IDs.
		 *
		 * Ignore:
		 *  - Our own ID: we always read our own updates.
		 *  - The ID if it is older than the oldest ID we saw. This
		 *    can happen if we race with a thread that is allocating
		 *    an ID -- the ID will not be used because the thread will
		 *    keep spinning until it gets a valid one.
		 */
		if (s != txn_state &&
		    (id = s->id) != WT_TXN_NONE &&
		    WT_TXNID_LE(prev_oldest_id, id)) {
			txn->snapshot[n++] = id;
			if (WT_TXNID_LT(id, snap_min))
				snap_min = id;
		}
	}

	/*
	 * If we got a new snapshot, update the published snap_min for this
	 * session.
	 */
	WT_ASSERT(session, WT_TXNID_LE(prev_oldest_id, snap_min));
	WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
	txn_state->snap_min = snap_min;

	WT_ASSERT(session, txn_global->scan_count > 0);
	(void)__wt_atomic_subiv32(&txn_global->scan_count, 1);

	__txn_sort_snapshot(session, n, current_id);
}
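
The scan_count handshake above is a small reader/writer protocol: scanners atomically increment the counter unless a writer has set it negative, and the thread updating the oldest ID may only flip the counter from 1 (itself being the sole scanner) to -1. A minimal sketch using C11 atomics; the names, the int type, and the use of sched_yield() in place of a pause instruction are assumptions for illustration:

#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int scan_count;

/* Reader side: wait out any writer, then publish ourselves as a scanner. */
static void
scan_enter(void)
{
	int count;

	do {
		while ((count = atomic_load(&scan_count)) < 0)
			sched_yield();		/* A writer holds the counter. */
	} while (!atomic_compare_exchange_weak(&scan_count, &count, count + 1));
}

static void
scan_leave(void)
{
	(void)atomic_fetch_sub(&scan_count, 1);
}

/* Writer side: only the sole remaining scanner may go exclusive. */
static bool
scan_try_exclusive(void)
{
	int expected = 1;

	return (atomic_compare_exchange_strong(&scan_count, &expected, -1));
}

static void
scan_release_exclusive(void)
{
	atomic_store(&scan_count, 0);
}

This matches the pattern in __wt_txn_update_oldest below, where the updating thread attempts the 1-to-(-1) transition before publishing a new oldest ID.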
Example #25
0
/*
 * __curjoin_init_next --
 *	Initialize the cursor join when the next function is first called.
 */
static int
__curjoin_init_next(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    bool iterable)
{
	WT_BLOOM *bloom;
	WT_CURSOR *origcur;
	WT_CURSOR_JOIN_ENDPOINT *end;
	WT_CURSOR_JOIN_ENTRY *je, *jeend, *je2;
	WT_DECL_RET;
	size_t size;
	uint32_t f, k;
	char *mainbuf;
	const char *def_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), NULL };
	const char *raw_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), "raw", NULL };
	const char **config, *proj, *urimain;

	mainbuf = NULL;
	if (cjoin->entries_next == 0)
		WT_RET_MSG(session, EINVAL,
		    "join cursor has not yet been joined with any other "
		    "cursors");

	/* Get a consistent view of our subordinate cursors if appropriate. */
	__wt_txn_cursor_op(session);

	if (F_ISSET((WT_CURSOR *)cjoin, WT_CURSTD_RAW))
		config = &raw_cfg[0];
	else
		config = &def_cfg[0];
	urimain = cjoin->table->iface.name;
	if ((proj = cjoin->projection) != NULL) {
		size = strlen(urimain) + strlen(proj) + 1;
		WT_ERR(__wt_calloc(session, size, 1, &mainbuf));
		WT_ERR(__wt_snprintf(mainbuf, size, "%s%s", urimain, proj));
		urimain = mainbuf;
	}
	WT_ERR(__wt_open_cursor(session, urimain, (WT_CURSOR *)cjoin, config,
	    &cjoin->main));

	jeend = &cjoin->entries[cjoin->entries_next];
	for (je = cjoin->entries; je < jeend; je++) {
		if (je->subjoin != NULL) {
			WT_ERR(__curjoin_init_next(session, je->subjoin,
			    iterable));
			continue;
		}
		__wt_stat_join_init_single(&je->stats);
		/*
		 * For a single compare=le/lt endpoint in any entry that may
		 * be iterated, construct a companion compare=ge endpoint
		 * that will actually be iterated.
		 */
		if (iterable && je->ends_next == 1 &&
		    F_ISSET(&je->ends[0], WT_CURJOIN_END_LT)) {
			origcur = je->ends[0].cursor;
			WT_ERR(__curjoin_insert_endpoint(session, je, 0, &end));
			WT_ERR(__wt_open_cursor(session, origcur->uri,
			    (WT_CURSOR *)cjoin,
			    F_ISSET(origcur, WT_CURSTD_RAW) ? raw_cfg : def_cfg,
			    &end->cursor));
			end->flags = WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ |
			    WT_CURJOIN_END_OWN_CURSOR;
			WT_ERR(end->cursor->next(end->cursor));
			F_CLR(je, WT_CURJOIN_ENTRY_DISJUNCTION);
		}
		for (end = &je->ends[0]; end < &je->ends[je->ends_next];
		     end++)
			WT_ERR(__curjoin_endpoint_init_key(session, je, end));

		/*
		 * Do any needed Bloom filter initialization.  Ignore Bloom
		 * filters for entries that will be iterated.  They won't
		 * help since these entries either don't need an inclusion
		 * check or are doing any needed check during the iteration.
		 */
		if (!iterable && F_ISSET(je, WT_CURJOIN_ENTRY_BLOOM)) {
			if (session->txn.isolation == WT_ISO_READ_UNCOMMITTED)
			       WT_ERR_MSG(session, EINVAL,
				    "join cursors with Bloom filters cannot be "
				    "used with read-uncommitted isolation");
			if (je->bloom == NULL) {
				/*
				 * Look for compatible filters to be shared,
				 * pick compatible numbers for bit counts
				 * and number of hashes.
				 */
				f = je->bloom_bit_count;
				k = je->bloom_hash_count;
				for (je2 = je + 1; je2 < jeend; je2++)
					if (F_ISSET(je2,
					    WT_CURJOIN_ENTRY_BLOOM) &&
					    je2->count == je->count) {
						f = WT_MAX(
						    je2->bloom_bit_count, f);
						k = WT_MAX(
						    je2->bloom_hash_count, k);
					}
				je->bloom_bit_count = f;
				je->bloom_hash_count = k;
				WT_ERR(__wt_bloom_create(session, NULL,
				    NULL, je->count, f, k, &je->bloom));
				F_SET(je, WT_CURJOIN_ENTRY_OWN_BLOOM);
				WT_ERR(__curjoin_init_bloom(session, cjoin,
				    je, je->bloom));
				/*
				 * Share the Bloom filter, making all
				 * config info consistent.
				 */
				for (je2 = je + 1; je2 < jeend; je2++)
					if (F_ISSET(je2,
					    WT_CURJOIN_ENTRY_BLOOM) &&
					    je2->count == je->count) {
						WT_ASSERT(session,
						    je2->bloom == NULL);
						je2->bloom = je->bloom;
						je2->bloom_bit_count = f;
						je2->bloom_hash_count = k;
					}
			} else {
				/*
				 * Create a temporary filter that we'll
				 * merge into the shared one.  The Bloom
				 * parameters of the two filters must match.
				 */
				WT_ERR(__wt_bloom_create(session, NULL,
				    NULL, je->count, je->bloom_bit_count,
				    je->bloom_hash_count, &bloom));
				WT_ERR(__curjoin_init_bloom(session, cjoin,
				    je, bloom));
				WT_ERR(__wt_bloom_intersection(je->bloom,
				    bloom));
				WT_ERR(__wt_bloom_close(bloom));
			}
		}
		if (!F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION))
			iterable = false;
	}
	F_SET(cjoin, WT_CURJOIN_INITIALIZED);

err:	__wt_free(session, mainbuf);
	return (ret);
}
Example #26
0
/*
 * __wt_txn_update_oldest --
 *	Sweep the running transactions to update the oldest ID required.
 * !!!
 * If a data-source is calling the WT_EXTENSION_API.transaction_oldest
 * method (for the oldest transaction ID not yet visible to a running
 * transaction), and then comparing that oldest ID against committed
 * transactions to see if updates for a committed transaction are still
 * visible to running transactions, the oldest transaction ID may be
 * the same as the last committed transaction ID, if the transaction
 * state wasn't refreshed after the last transaction committed.  Push
 * past the last committed transaction.
*/
void
__wt_txn_update_oldest(WT_SESSION_IMPL *session, bool force)
{
	WT_CONNECTION_IMPL *conn;
	WT_SESSION_IMPL *oldest_session;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_STATE *s;
	uint64_t current_id, id, last_running, oldest_id, prev_oldest_id;
	uint32_t i, session_cnt;
	int32_t count;
	bool last_running_moved;

	conn = S2C(session);
	txn_global = &conn->txn_global;

retry:
	current_id = last_running = txn_global->current;
	oldest_session = NULL;
	prev_oldest_id = txn_global->oldest_id;

	/*
	 * For pure read-only workloads, or if the update isn't forced and the
	 * oldest ID isn't too far behind, avoid scanning.
	 */
	if (prev_oldest_id == current_id ||
	    (!force && WT_TXNID_LT(current_id, prev_oldest_id + 100)))
		return;

	/*
	 * We're going to scan.  Increment the count of scanners to prevent the
	 * oldest ID from moving forwards.  Spin if the count is negative,
	 * which indicates that some thread is moving the oldest ID forwards.
	 */
	do {
		if ((count = txn_global->scan_count) < 0)
			WT_PAUSE();
	} while (count < 0 ||
	    !__wt_atomic_casiv32(&txn_global->scan_count, count, count + 1));

	/* The oldest ID cannot change until the scan count goes to zero. */
	prev_oldest_id = txn_global->oldest_id;
	current_id = oldest_id = last_running = txn_global->current;

	/* Walk the array of concurrent transactions. */
	WT_ORDERED_READ(session_cnt, conn->session_cnt);
	for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
		/*
		 * Update the oldest ID.
		 *
		 * Ignore: IDs older than the oldest ID we saw. This can happen
		 * if we race with a thread that is allocating an ID -- the ID
		 * will not be used because the thread will keep spinning until
		 * it gets a valid one.
		 */
		if ((id = s->id) != WT_TXN_NONE &&
		    WT_TXNID_LE(prev_oldest_id, id) &&
		    WT_TXNID_LT(id, last_running))
			last_running = id;

		/*
		 * !!!
		 * Note: Don't ignore snap_min values older than the previous
		 * oldest ID.  Read-uncommitted operations publish snap_min
		 * values without incrementing scan_count to protect the global
		 * table.  See the comment in __wt_txn_cursor_op for
		 * more details.
		 */
		if ((id = s->snap_min) != WT_TXN_NONE &&
		    WT_TXNID_LT(id, oldest_id)) {
			oldest_id = id;
			oldest_session = &conn->sessions[i];
		}
	}

	if (WT_TXNID_LT(last_running, oldest_id))
		oldest_id = last_running;

	/* The oldest ID can't move past any named snapshots. */
	if ((id = txn_global->nsnap_oldest_id) != WT_TXN_NONE &&
	    WT_TXNID_LT(id, oldest_id))
		oldest_id = id;

	/* Update the last running ID. */
	last_running_moved =
	    WT_TXNID_LT(txn_global->last_running, last_running);

	/* Update the oldest ID. */
	if (WT_TXNID_LT(prev_oldest_id, oldest_id) || last_running_moved) {
		/*
		 * We know we want to update.  Check if we're racing.
		 */
		if (__wt_atomic_casiv32(&txn_global->scan_count, 1, -1)) {
			WT_ORDERED_READ(session_cnt, conn->session_cnt);
			for (i = 0, s = txn_global->states;
			    i < session_cnt; i++, s++) {
				if ((id = s->id) != WT_TXN_NONE &&
				WT_TXNID_LT(id, last_running))
					last_running = id;
				if ((id = s->snap_min) != WT_TXN_NONE &&
				WT_TXNID_LT(id, oldest_id))
					oldest_id = id;
			}

			if (WT_TXNID_LT(last_running, oldest_id))
				oldest_id = last_running;

#ifdef HAVE_DIAGNOSTIC
			/*
			 * Make sure the ID doesn't move past any named
			 * snapshots.
			 *
			 * Don't include the read/assignment in the assert
			 * statement.  Coverity complains if there are
			 * assignments only done in diagnostic builds, and
			 * when the read is from a volatile.
			 */
			id = txn_global->nsnap_oldest_id;
			WT_ASSERT(session,
			    id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id));
#endif
			if (WT_TXNID_LT(txn_global->last_running, last_running))
				txn_global->last_running = last_running;
			if (WT_TXNID_LT(txn_global->oldest_id, oldest_id))
				txn_global->oldest_id = oldest_id;
			WT_ASSERT(session, txn_global->scan_count == -1);
			txn_global->scan_count = 0;
		} else {
			/*
			 * We wanted to update the oldest ID but we're racing
			 * another thread.  Retry if this is a forced update.
			 */
			WT_ASSERT(session, txn_global->scan_count > 0);
			(void)__wt_atomic_subiv32(&txn_global->scan_count, 1);
			if (force) {
				__wt_yield();
				goto retry;
			}
		}
	} else {
		if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) &&
		    current_id - oldest_id > 10000 && oldest_session != NULL) {
			(void)__wt_verbose(session, WT_VERB_TRANSACTION,
			    "old snapshot %" PRIu64
			    " pinned in session %d [%s]"
			    " with snap_min %" PRIu64 "\n",
			    oldest_id, oldest_session->id,
			    oldest_session->lastop,
			    oldest_session->txn.snap_min);
		}
		WT_ASSERT(session, txn_global->scan_count > 0);
		(void)__wt_atomic_subiv32(&txn_global->scan_count, 1);
	}
}
Example #27
0
/*
 * __wt_evict_file --
 *	Discard pages for a specific file.
 */
int
__wt_evict_file(WT_SESSION_IMPL *session, int syncop)
{
	WT_DECL_RET;
	WT_PAGE *page;
	WT_REF *next_ref, *ref;
	bool evict_reset;

	/*
	 * We need exclusive access to the file -- disable ordinary eviction
	 * and drain any blocks already queued.
	 */
	WT_RET(__wt_evict_file_exclusive_on(session, &evict_reset));

	/* Make sure the oldest transaction ID is up-to-date. */
	__wt_txn_update_oldest(session, true);

	/* Walk the tree, discarding pages. */
	next_ref = NULL;
	WT_ERR(__wt_tree_walk(session, &next_ref, NULL,
	    WT_READ_CACHE | WT_READ_NO_EVICT));
	while ((ref = next_ref) != NULL) {
		page = ref->page;

		/*
		 * Eviction can fail when a page in the evicted page's subtree
		 * switches state.  For example, if we don't evict a page marked
		 * empty, because we expect it to be merged into its parent, it
		 * might no longer be empty after it's reconciled, in which case
		 * eviction of its parent would fail.  We can either walk the
		 * tree multiple times (until it's finally empty), or reconcile
		 * each page to get it to its final state before considering if
		 * it's an eviction target or will be merged into its parent.
		 *
		 * Don't limit this test to any particular page type, that tends
		 * to introduce bugs when the reconciliation of other page types
		 * changes, and there's no advantage to doing so.
		 *
		 * Eviction can also fail because an update cannot be written.
		 * If sessions have disjoint sets of files open, updates in a
		 * no-longer-referenced file may not yet be globally visible,
		 * and the write will fail with EBUSY.  Our caller handles that
		 * error, retrying later.
		 */
		if (syncop == WT_SYNC_CLOSE && __wt_page_is_modified(page))
			WT_ERR(__wt_reconcile(session, ref, NULL, WT_EVICTING));

		/*
		 * We can't evict the page just returned to us (it marks our
		 * place in the tree), so move the walk to one page ahead of
		 * the page being evicted.  Note, we reconciled the returned
		 * page first: if reconciliation of that page were to change
		 * the shape of the tree, and we did the next walk call before
		 * the reconciliation, the next walk call could miss a page in
		 * the tree.
		 */
		WT_ERR(__wt_tree_walk(session, &next_ref, NULL,
		    WT_READ_CACHE | WT_READ_NO_EVICT));

		switch (syncop) {
		case WT_SYNC_CLOSE:
			/*
			 * Evict the page.
			 */
			WT_ERR(__wt_evict(session, ref, 1));
			break;
		case WT_SYNC_DISCARD:
			/*
			 * Dead handles may reference dirty pages; clean the
			 * page, both to keep statistics correct, and to let
			 * the page-discard function assert no dirty page is
			 * ever discarded.
			 */
			if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
				__wt_page_modify_clear(session, page);

			WT_ASSERT(session,
			    F_ISSET(session->dhandle, WT_DHANDLE_DEAD) ||
			    __wt_page_can_evict(session, ref, false, NULL));
			__wt_evict_page_clean_update(session, ref, 1);
			break;
		WT_ILLEGAL_VALUE_ERR(session);
		}
	}

	if (0) {
err:		/* On error, clear any left-over tree walk. */
		if (next_ref != NULL)
			WT_TRET(__wt_page_release(
			    session, next_ref, WT_READ_NO_EVICT));
	}

	if (evict_reset)
		__wt_evict_file_exclusive_off(session);

	return (ret);
}
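
The walk above deliberately stays one page ahead of the page it is about to evict. The same idea in its simplest form is sketched below over a singly linked list (types and names are illustrative only): capture the next element before discarding the current one, so the discard can never invalidate the walk's position.

#include <stdlib.h>

struct node {
	struct node *next;
	/* ... payload ... */
};

static void
discard_all(struct node **headp)
{
	struct node *node, *next;

	for (node = *headp; node != NULL; node = next) {
		next = node->next;	/* Step ahead before discarding. */
		free(node);
	}
	*headp = NULL;
}

The tree version has the extra wrinkle noted above: the next-walk call has to happen after reconciliation, because reconciliation can reshape the tree and a stale walk position could miss pages.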
Example #28
0
/*
 * __wt_txn_commit --
 *	Commit the current transaction.
 */
int
__wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_TXN *txn;
	WT_TXN_OP *op;
	u_int i;

	txn = &session->txn;
	conn = S2C(session);
	WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || txn->mod_count == 0);

	if (!F_ISSET(txn, WT_TXN_RUNNING))
		WT_RET_MSG(session, EINVAL, "No transaction is active");

	/*
	 * The default sync setting is inherited from the connection, but can
	 * be overridden by an explicit "sync" setting for this transaction.
	 */
	WT_RET(__wt_config_gets_def(session, cfg, "sync", 0, &cval));

	/*
	 * If the user chose the default setting, check whether sync is enabled
	 * for this transaction (either inherited or via begin_transaction).
	 * If sync is disabled, clear the field to avoid the log write being
	 * flushed.
	 *
	 * Otherwise check for specific settings.  We don't need to check for
	 * "on" because that is the default inherited from the connection.  If
	 * the user set anything in begin_transaction, we only override with an
	 * explicit setting.
	 */
	if (cval.len == 0) {
		if (!FLD_ISSET(txn->txn_logsync, WT_LOG_SYNC_ENABLED) &&
		    !F_ISSET(txn, WT_TXN_SYNC_SET))
			txn->txn_logsync = 0;
	} else {
		/*
		 * If the caller already set sync on begin_transaction then
		 * they should not be using sync on commit_transaction.
		 * Flag that as an error.
		 */
		if (F_ISSET(txn, WT_TXN_SYNC_SET))
			WT_RET_MSG(session, EINVAL,
			    "Sync already set during begin_transaction.");
		if (WT_STRING_MATCH("background", cval.str, cval.len))
			txn->txn_logsync = WT_LOG_BACKGROUND;
		else if (WT_STRING_MATCH("off", cval.str, cval.len))
			txn->txn_logsync = 0;
		/*
		 * We don't need to check for "on" here because that is the
		 * default to inherit from the connection setting.
		 */
	}

	/* Commit notification. */
	if (txn->notify != NULL)
		WT_TRET(txn->notify->notify(txn->notify,
		    (WT_SESSION *)session, txn->id, 1));

	/* If we are logging, write a commit log record. */
	if (ret == 0 && txn->mod_count > 0 &&
	    FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
	    !F_ISSET(session, WT_SESSION_NO_LOGGING)) {
		/*
		 * We are about to block on I/O writing the log.
		 * Release our snapshot in case it is keeping data pinned.
		 * This is particularly important for checkpoints.
		 */
		__wt_txn_release_snapshot(session);
		ret = __wt_txn_log_commit(session, cfg);
	}

	/*
	 * If anything went wrong, roll back.
	 *
	 * !!!
	 * Nothing can fail after this point.
	 */
	if (ret != 0) {
		WT_TRET(__wt_txn_rollback(session, cfg));
		return (ret);
	}

	/* Free memory associated with updates. */
	for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++)
		__wt_txn_op_free(session, op);
	txn->mod_count = 0;

	/*
	 * We are about to release the snapshot: copy values into any
	 * positioned cursors so they don't point to updates that could be
	 * freed once we don't have a transaction ID pinned.
	 */
	if (session->ncursors > 0)
		WT_RET(__wt_session_copy_values(session));

	__wt_txn_release(session);
	return (0);
}
Example #29
0
/*
 * __wt_block_write_off --
 *	Write a buffer into a block, returning the block's addr/size and
 * checksum.
 */
int
__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, off_t *offsetp, uint32_t *sizep, uint32_t *cksump,
    int data_cksum, int locked)
{
	WT_BLOCK_HEADER *blk;
	WT_DECL_RET;
	WT_FH *fh;
	off_t offset;
	uint32_t align_size;

	blk = WT_BLOCK_HEADER_REF(buf->mem);
	fh = block->fh;

	/* Buffers should be aligned for writing. */
	if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
		WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
		WT_RET_MSG(session, EINVAL,
		    "direct I/O check: write buffer incorrectly allocated");
	}

	/*
	 * Align the size to an allocation unit.
	 *
	 * The buffer must be big enough for us to zero to the next allocsize
	 * boundary; this is one of the reasons the btree layer must find out
	 * from the block-manager layer the maximum size of the eventual write.
	 */
	align_size = (uint32_t)WT_ALIGN(buf->size, block->allocsize);
	if (align_size > buf->memsize) {
		WT_ASSERT(session, align_size <= buf->memsize);
		WT_RET_MSG(session, EINVAL,
		    "buffer size check: write buffer incorrectly allocated");
	}

	/* Zero out any unused bytes at the end of the buffer. */
	memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size);

	/*
	 * Set the disk size so we don't have to incrementally read blocks
	 * during salvage.
	 */
	blk->disk_size = align_size;

	/*
	 * Update the block's checksum: if our caller specifies, checksum the
	 * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP
	 * bytes.  The assumption is applications with good compression support
	 * turn off checksums and assume corrupted blocks won't decompress
	 * correctly.  However, if compression failed to shrink the block, the
	 * block wasn't compressed, in which case our caller will tell us to
	 * checksum the data to detect corruption.   If compression succeeded,
	 * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes
	 * because they're not compressed, both to give salvage a quick test
	 * of whether a block is useful and to give us a test so we don't lose
	 * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing.
	 */
	blk->flags = 0;
	if (data_cksum)
		F_SET(blk, WT_BLOCK_DATA_CKSUM);
	blk->cksum = 0;
	blk->cksum = __wt_cksum(
	    buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP);

	if (!locked)
		__wt_spin_lock(session, &block->live_lock);
	ret = __wt_block_alloc(session, block, &offset, (off_t)align_size);
	if (!locked)
		__wt_spin_unlock(session, &block->live_lock);
	WT_RET(ret);

#if defined(HAVE_POSIX_FALLOCATE) || defined(HAVE_FTRUNCATE)
	/*
	 * Extend the file in chunks.  We aren't holding a lock and we'd prefer
	 * to limit the number of threads extending the file at the same time,
	 * so choose the one thread that's crossing the extended boundary.  We
	 * don't extend newly created files, and it's theoretically possible we
	 * might wait so long that our extension of the file is passed by
	 * another thread writing single blocks; that's why there's a check in
	 * case the extended file size becomes too small: if the file size
	 * catches up, every thread will try to extend it.
	 */
	if (fh->extend_len != 0 &&
	    (fh->extend_size <= fh->size ||
	    (offset + fh->extend_len <= fh->extend_size &&
	    offset + fh->extend_len + align_size >= fh->extend_size))) {
		fh->extend_size = offset + fh->extend_len * 2;
#if defined(HAVE_POSIX_FALLOCATE)
		if ((ret =
		    posix_fallocate(fh->fd, offset, fh->extend_len * 2)) != 0)
			WT_RET_MSG(
			    session, ret, "%s: posix_fallocate", fh->name);
#elif defined(HAVE_FTRUNCATE)
		if ((ret = ftruncate(fh->fd, fh->extend_size)) != 0)
			WT_RET_MSG(session, ret, "%s: ftruncate", fh->name);
#endif
	}
#endif
	if ((ret =
	    __wt_write(session, fh, offset, align_size, buf->mem)) != 0) {
		if (!locked)
			__wt_spin_lock(session, &block->live_lock);
		WT_TRET(
		    __wt_block_off_free(session, block, offset, align_size));
		if (!locked)
			__wt_spin_unlock(session, &block->live_lock);
		WT_RET(ret);
	}

#ifdef HAVE_SYNC_FILE_RANGE
	/*
	 * Optionally schedule writes for dirty pages in the system buffer
	 * cache.
	 */
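	/*
	 * os_cache_dirty accumulates the bytes written since the last flush;
	 * once it crosses the configured maximum, ask the kernel to start
	 * writing the file's dirty pages (without waiting for completion) and
	 * reset the counter.
	 */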
	if (block->os_cache_dirty_max != 0 &&
	    (block->os_cache_dirty += align_size) > block->os_cache_dirty_max) {
		block->os_cache_dirty = 0;
		if ((ret = sync_file_range(fh->fd,
		    (off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE)) != 0)
			WT_RET_MSG(
			    session, ret, "%s: sync_file_range", block->name);
	}
#endif
#ifdef HAVE_POSIX_FADVISE
	/* Optionally discard blocks from the system buffer cache. */
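	/*
	 * Similarly, os_cache accumulates the bytes written; once it crosses
	 * the configured maximum, advise the kernel the file's cached pages
	 * won't be needed again and reset the counter.
	 */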
	if (block->os_cache_max != 0 &&
	    (block->os_cache += align_size) > block->os_cache_max) {
		block->os_cache = 0;
		if ((ret = posix_fadvise(fh->fd,
		    (off_t)0, (off_t)0, POSIX_FADV_DONTNEED)) != 0)
			WT_RET_MSG(
			    session, ret, "%s: posix_fadvise", block->name);
	}
#endif
	WT_CSTAT_INCR(session, block_write);
	WT_CSTAT_INCRV(session, block_byte_write, align_size);

	WT_VERBOSE_RET(session, write,
	    "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
	    (uintmax_t)offset, align_size, blk->cksum);

	*offsetp = offset;
	*sizep = align_size;
	*cksump = blk->cksum;

	return (ret);
}
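/*
 * A minimal standalone sketch (illustrative only, not part of the function
 * above): the align-and-zero-pad pattern used when writing blocks.  The
 * helper names (align_up, write_block) are hypothetical.
 */
#include <stdint.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

/* Round size up to the next multiple of allocsize (a power of two). */
static size_t
align_up(size_t size, size_t allocsize)
{
	return ((size + allocsize - 1) & ~(allocsize - 1));
}

/*
 * write_block --
 *	Zero the buffer's tail to the allocation boundary and write it at the
 *	given file offset.  The caller must have allocated at least
 *	align_up(size, allocsize) bytes, mirroring the "write buffer
 *	incorrectly allocated" checks above.
 */
static int
write_block(int fd, void *buf, size_t size, size_t allocsize, off_t offset)
{
	size_t align_size;

	align_size = align_up(size, allocsize);
	memset((uint8_t *)buf + size, 0, align_size - size);
	return (pwrite(fd, buf, align_size, offset) ==
	    (ssize_t)align_size ? 0 : -1);
}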
Example #30
0
/*
 * __wt_rec_evict --
 *	Reconciliation plus eviction.
 */
int
__wt_rec_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int exclusive)
{
	WT_DECL_RET;
	WT_PAGE_MODIFY *mod;
	int merge;

	WT_VERBOSE_RET(session, evict,
	    "page %p (%s)", page, __wt_page_type_string(page->type));

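	/*
	 * session->excl_next counts the exclusive page locks acquired during
	 * the review below; it must be zero on entry and is reset on exit.
	 */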
	WT_ASSERT(session, session->excl_next == 0);

	/*
	 * If we get a split-merge page during normal eviction, try to collapse
	 * it.  During close, it will be merged into its parent.
	 */
	mod = page->modify;
	merge = __wt_btree_mergeable(page);
	if (merge && exclusive)
		return (EBUSY);

	WT_ASSERT(session, merge || mod == NULL ||
	    !F_ISSET(mod, WT_PM_REC_SPLIT_MERGE));

	/*
	 * Get exclusive access to the page and review the page and its subtree
	 * for conditions that would block our eviction of the page.  If the
	 * check fails (for example, we find a child page that can't be merged),
	 * we're done.  We have to make this check for clean pages, too: while
	 * it's unlikely eviction would choose an internal page with children,
	 * it's not disallowed anywhere.
	 *
	 * Note that page->ref may be NULL in some cases (e.g., for root pages
	 * or during salvage).  That's OK if exclusive is set: we won't check
	 * hazard pointers in that case.
	 */
	WT_ERR(__rec_review(session, page->ref, page, exclusive, merge, 1));

	/* Try to merge internal pages. */
	if (merge)
		WT_ERR(__wt_merge_tree(session, page));

	/*
	 * Update the page's modification reference; reconciliation might have
	 * changed it.
	 */
	mod = page->modify;

	/* Count evictions of internal pages during normal operation. */
	if (!exclusive && !merge &&
	    (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT)) {
		WT_CSTAT_INCR(session, cache_eviction_internal);
		WT_DSTAT_INCR(session, cache_eviction_internal);
	}

	/*
	 * Update the parent and discard the page.
	 */
	if (mod == NULL || !F_ISSET(mod, WT_PM_REC_MASK)) {
		WT_ASSERT(session,
		    exclusive || page->ref->state == WT_REF_LOCKED);

		if (WT_PAGE_IS_ROOT(page))
			__rec_root_update(session);
		else
			__rec_page_clean_update(session, page);

		/* Discard the page. */
		__rec_discard_page(session, page, exclusive);

		WT_CSTAT_INCR(session, cache_eviction_clean);
		WT_DSTAT_INCR(session, cache_eviction_clean);
	} else {
		if (WT_PAGE_IS_ROOT(page))
			__rec_root_update(session);
		else
			WT_ERR(__rec_page_dirty_update(session, page));

		/* Discard the tree rooted in this page. */
		__rec_discard_tree(session, page, exclusive);

		WT_CSTAT_INCR(session, cache_eviction_dirty);
		WT_DSTAT_INCR(session, cache_eviction_dirty);
	}
	if (0) {
err:		/*
		 * If unable to evict this page, release exclusive reference(s)
		 * we've acquired.
		 */
		__rec_excl_clear(session);

		WT_CSTAT_INCR(session, cache_eviction_fail);
		WT_DSTAT_INCR(session, cache_eviction_fail);
	}
	session->excl_next = 0;

	return (ret);
}
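/*
 * A minimal standalone sketch (illustrative only, not WiredTiger code) of the
 * "if (0) { err: ... }" idiom used in __wt_rec_evict above: error paths jump
 * to a label inside a block the success path never enters, so error-only
 * cleanup stays off the normal return path.  The RET macro and do_step
 * function are hypothetical stand-ins for WT_ERR and the real work.
 */
#include <errno.h>
#include <stdio.h>

#define	RET(call) do {							\
	if ((ret = (call)) != 0)					\
		goto err;						\
} while (0)

static int
do_step(int fail)
{
	return (fail ? EBUSY : 0);
}

static int
evict_like(int fail)
{
	int ret = 0;

	RET(do_step(0));	/* Always succeeds. */
	RET(do_step(fail));	/* May jump to err. */

	printf("success: update parent, discard page\n");

	if (0) {
err:		/* Error-only cleanup, skipped on success. */
		printf("failure: release exclusive references\n");
	}
	return (ret);
}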