Example #1
/*
 * __wt_btcur_insert --
 *	Insert a record into the tree.
 */
int
__wt_btcur_insert(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;

	WT_STAT_CONN_INCR(session, cursor_insert);
	WT_STAT_DATA_INCR(session, cursor_insert);
	WT_STAT_DATA_INCRV(session,
	    cursor_insert_bytes, cursor->key.size + cursor->value.size);

	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));
	WT_RET(__cursor_size_chk(session, &cursor->value));

	/*
	 * The tree is no longer empty: eviction should pay attention to it,
	 * and it's no longer possible to bulk-load into it.
	 */
	if (btree->bulk_load_ok) {
		btree->bulk_load_ok = false;
		__wt_btree_evictable(session, true);
	}

retry:	WT_RET(__cursor_func_init(cbt, true));

	switch (btree->type) {
	case BTREE_COL_FIX:
	case BTREE_COL_VAR:
		/*
		 * If WT_CURSTD_APPEND is set, insert a new record (ignoring
		 * the application's record number). The real record number
		 * is assigned by the serialized append operation.
		 */
		if (F_ISSET(cursor, WT_CURSTD_APPEND))
			cbt->iface.recno = WT_RECNO_OOB;

		WT_ERR(__cursor_col_search(session, cbt, NULL));

		/*
		 * If not overwriting, fail if the key exists.  Creating a
		 * record past the end of the tree in a fixed-length
		 * column-store implicitly fills the gap with empty records.
		 * Fail in that case, the record exists.
		 */
		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
		    ((cbt->compare == 0 && __cursor_valid(cbt, NULL)) ||
		    (cbt->compare != 0 && __cursor_fix_implicit(btree, cbt))))
			WT_ERR(WT_DUPLICATE_KEY);

		WT_ERR(__cursor_col_modify(session, cbt, false));
		if (F_ISSET(cursor, WT_CURSTD_APPEND))
			cbt->iface.recno = cbt->recno;
		break;
	case BTREE_ROW:
		WT_ERR(__cursor_row_search(session, cbt, NULL, true));
		/*
		 * If not overwriting, fail if the key exists, else insert the
		 * key/value pair.
		 */
		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
		    cbt->compare == 0 && __cursor_valid(cbt, NULL))
			WT_ERR(WT_DUPLICATE_KEY);

		ret = __cursor_row_modify(session, cbt, false);
		break;
	}

err:	if (ret == WT_RESTART) {
		WT_STAT_CONN_INCR(session, cursor_restart);
		WT_STAT_DATA_INCR(session, cursor_restart);
		goto retry;
	}
	/* Insert doesn't maintain a position across calls, clear resources. */
	if (ret == 0)
		WT_TRET(__curfile_leave(cbt));
	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}
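A minimal usage sketch (not part of the example above): the internal __wt_btcur_insert path is normally reached through the public WT_CURSOR API. The table URI, string key/value formats and the "overwrite=false" configuration below are illustrative assumptions.

#include <wiredtiger.h>

/*
 * insert_one --
 *	Sketch: insert a key/value pair through the public cursor API; with
 *	"overwrite=false", inserting an existing key fails with
 *	WT_DUPLICATE_KEY, matching the check in __wt_btcur_insert.
 */
static int
insert_one(WT_SESSION *session)
{
	WT_CURSOR *cursor;
	int ret;

	if ((ret = session->open_cursor(session,
	    "table:example", NULL, "overwrite=false", &cursor)) != 0)
		return (ret);

	cursor->set_key(cursor, "key1");	/* assumes key_format=S */
	cursor->set_value(cursor, "value1");	/* assumes value_format=S */
	ret = cursor->insert(cursor);		/* drives __wt_btcur_insert */

	(void)cursor->close(cursor);
	return (ret);
}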
Example #2
/*
 * __wt_btcur_prev --
 *	Move to the previous record in the tree.
 */
int
__wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating)
{
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	uint32_t flags;
	bool newpage;

	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cbt->iface.session;

	WT_STAT_CONN_INCR(session, cursor_prev);
	WT_STAT_DATA_INCR(session, cursor_prev);

	F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);

	WT_RET(__cursor_func_init(cbt, false));

	/*
	 * If we aren't already iterating in the right direction, there's
	 * some setup to do.
	 */
	if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV))
		__wt_btcur_iterate_setup(cbt);

	/*
	 * Walk any page we're holding until the underlying call returns not-
	 * found.  Then, move to the previous page, until we reach the start
	 * of the file.
	 */
	flags = WT_READ_PREV | WT_READ_SKIP_INTL;	/* tree walk flags */
	LF_SET(WT_READ_NO_SPLIT);			/* don't try to split */
	if (truncating)
		LF_SET(WT_READ_TRUNCATE);
	for (newpage = false;; newpage = true) {
		page = cbt->ref == NULL ? NULL : cbt->ref->page;

		/*
		 * Column-store pages may have appended entries. Handle it
		 * separately from the usual cursor code, it's in a simple
		 * format.
		 */
		if (newpage && page != NULL && page->type != WT_PAGE_ROW_LEAF &&
		    (cbt->ins_head = WT_COL_APPEND(page)) != NULL)
			F_SET(cbt, WT_CBT_ITERATE_APPEND);

		if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
			switch (page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_append_prev(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_append_prev(cbt, newpage);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			if (ret == 0)
				break;
			F_CLR(cbt, WT_CBT_ITERATE_APPEND);
			if (ret != WT_NOTFOUND)
				break;
			newpage = true;
		}
		if (page != NULL) {
			switch (page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_prev(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_prev(cbt, newpage);
				break;
			case WT_PAGE_ROW_LEAF:
				ret = __cursor_row_prev(cbt, newpage);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			if (ret != WT_NOTFOUND)
				break;
		}

		/*
		 * If we saw a lot of deleted records on this page, or we went
		 * all the way through a page and only saw deleted records, try
		 * to evict the page when we release it.  Otherwise repeatedly
		 * deleting from the beginning of a tree can have quadratic
		 * performance.  Take care not to force eviction of pages that
		 * are genuinely empty, in new trees.
		 */
		if (page != NULL &&
		    (cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD ||
		    (newpage && cbt->page_deleted_count > 0)))
			__wt_page_evict_soon(session, cbt->ref);
		cbt->page_deleted_count = 0;

		WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
		WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
	}
#ifdef HAVE_DIAGNOSTIC
	if (ret == 0)
		WT_ERR(__wt_cursor_key_order_check(session, cbt, false));
#endif
	if (ret == 0)
		F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);

err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}
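__wt_btcur_prev backs WT_CURSOR::prev; at the application level a reverse scan simply loops until the cursor returns WT_NOTFOUND. A minimal sketch, assuming a cursor opened on a table with string keys and values:

#include <stdio.h>
#include <wiredtiger.h>

/*
 * scan_backward --
 *	Sketch: walk a table from the end toward the beginning; WT_NOTFOUND
 *	from cursor->prev marks the start of the table and ends the scan.
 */
static int
scan_backward(WT_CURSOR *cursor)
{
	const char *key, *value;
	int ret;

	while ((ret = cursor->prev(cursor)) == 0) {
		if ((ret = cursor->get_key(cursor, &key)) != 0 ||
		    (ret = cursor->get_value(cursor, &value)) != 0)
			return (ret);
		printf("%s -> %s\n", key, value);
	}
	return (ret == WT_NOTFOUND ? 0 : ret);
}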
Example #3
/*
 * __wt_txn_rollback --
 *	Roll back the current transaction.
 */
int
__wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_DECL_RET;
	WT_TXN *txn;
	WT_TXN_OP *op;
	u_int i;
	bool readonly;

	WT_UNUSED(cfg);

	txn = &session->txn;
	readonly = txn->mod_count == 0;
	WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));

	/* Rollback notification. */
	if (txn->notify != NULL)
		WT_TRET(txn->notify->notify(txn->notify, (WT_SESSION *)session,
		    txn->id, 0));

	/* Rollback updates. */
	for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
		/* Metadata updates are never rolled back. */
		if (op->fileid == WT_METAFILE_ID)
			continue;

		switch (op->type) {
		case WT_TXN_OP_BASIC:
		case WT_TXN_OP_BASIC_TS:
		case WT_TXN_OP_INMEM:
			WT_ASSERT(session, op->u.upd->txnid == txn->id);
			WT_ASSERT(session,
			    S2C(session)->cache->las_fileid == 0 ||
			    op->fileid != S2C(session)->cache->las_fileid);
			op->u.upd->txnid = WT_TXN_ABORTED;
			break;
		case WT_TXN_OP_REF:
			__wt_delete_page_rollback(session, op->u.ref);
			break;
		case WT_TXN_OP_TRUNCATE_COL:
		case WT_TXN_OP_TRUNCATE_ROW:
			/*
			 * Nothing to do: these operations are only logged for
			 * recovery.  The in-memory changes will be rolled back
			 * with a combination of WT_TXN_OP_REF and
			 * WT_TXN_OP_INMEM operations.
			 */
			break;
		}

		/* Free any memory allocated for the operation. */
		__wt_txn_op_free(session, op);
	}
	txn->mod_count = 0;

	__wt_txn_release(session);
	/*
	 * We're between transactions, if we need to block for eviction, it's
	 * a good time to do so.  Note that we must ignore any error return
	 * because the user's data is committed.
	 */
	if (!readonly)
		(void)__wt_cache_eviction_check(session, false, false, NULL);
	return (ret);
}
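__wt_txn_rollback is the internal half of WT_SESSION::rollback_transaction. A minimal sketch of the public call sequence, assuming a cursor on a string-keyed table; the insert below is discarded by the rollback, which (per the example above) marks the pending update's txnid as aborted.

#include <wiredtiger.h>

/*
 * insert_then_roll_back --
 *	Sketch: make a change inside an explicit transaction, then abandon it.
 */
static int
insert_then_roll_back(WT_SESSION *session, WT_CURSOR *cursor)
{
	int ret;

	if ((ret = session->begin_transaction(session, NULL)) != 0)
		return (ret);

	cursor->set_key(cursor, "key1");	/* assumes key_format=S */
	cursor->set_value(cursor, "value1");	/* assumes value_format=S */
	if ((ret = cursor->insert(cursor)) != 0) {
		(void)session->rollback_transaction(session, NULL);
		return (ret);
	}

	/* Abandon the change: the insert above is rolled back. */
	return (session->rollback_transaction(session, NULL));
}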
Example #4
/*
 * ex_aci --
 *	Append, change, insert in ex.
 */
static int
ex_aci(SCR *sp, EXCMD *cmdp, enum which cmd)
{
	CHAR_T *p, *t;
	GS *gp;
	TEXT *tp;
	TEXTH tiq[] = {{ 0 }};
	recno_t cnt = 0, lno;
	size_t len;
	u_int32_t flags;
	int need_newline;

	gp = sp->gp;
	NEEDFILE(sp, cmdp);

	/*
	 * If doing a change, replace lines for as long as possible.  Then,
	 * append more lines or delete remaining lines.  Changes to an empty
	 * file are appends, inserts are the same as appends to the previous
	 * line.
	 *
	 * !!!
	 * Set the address to which we'll append.  We set sp->lno to this
	 * address as well so that autoindent works correctly when getting text
	 * from the user.
	 */
	lno = cmdp->addr1.lno;
	sp->lno = lno;
	if ((cmd == CHANGE || cmd == INSERT) && lno != 0)
		--lno;

	/*
	 * !!!
	 * If the file isn't empty, cut changes into the unnamed buffer.
	 */
	if (cmd == CHANGE && cmdp->addr1.lno != 0 &&
	    (cut(sp, NULL, &cmdp->addr1, &cmdp->addr2, CUT_LINEMODE) ||
	    del(sp, &cmdp->addr1, &cmdp->addr2, 1)))
		return (1);

	/*
	 * !!!
	 * Anything that was left after the command separator becomes part
	 * of the inserted text.  Apparently, it was common usage to enter:
	 *
	 *	:g/pattern/append|stuff1
	 *
	 * and append the line of text "stuff1" to the lines containing the
	 * pattern.  It was also historically legal to enter:
	 *
	 *	:append|stuff1
	 *	stuff2
	 *	.
	 *
	 * and the text on the ex command line would be appended as well as
	 * the text inserted after it.  There was an historic bug however,
	 * that the user had to enter *two* terminating lines (the '.' lines)
	 * to terminate text input mode, in this case.  This whole thing
	 * could be taken too far, however.  Entering:
	 *
	 *	:append|stuff1\
	 *	stuff2
	 *	stuff3
	 *	.
	 *
	 * i.e. mixing and matching the forms confused the historic vi, and,
	 * not only did it take two terminating lines to terminate text input
	 * mode, but the trailing backslashes were retained on the input.  We
	 * match historic practice except that we discard the backslashes.
	 *
	 * Input lines specified on the ex command line are separated by
	 * <newline>s.  If there is a trailing delimiter an empty line was
	 * inserted.  There may also be a leading delimiter, which is ignored
	 * unless it's also a trailing delimiter.  It is possible to encounter
	 * a termination line, i.e. a single '.', in a global command, but not
	 * necessary if the text insert command was the last of the global
	 * commands.
	 */
	if (cmdp->save_cmdlen != 0) {
		for (p = cmdp->save_cmd,
		    len = cmdp->save_cmdlen; len > 0; p = t) {
			for (t = p; len > 0 && t[0] != '\n'; ++t, --len);
			if (t != p || len == 0) {
				if (F_ISSET(sp, SC_EX_GLOBAL) &&
				    t - p == 1 && p[0] == '.') {
					++t;
					if (len > 0)
						--len;
					break;
				}
				if (db_append(sp, 1, lno++, p, t - p))
					return (1);
			}
			if (len != 0) {
				++t;
				if (--len == 0 &&
				    db_append(sp, 1, lno++, NULL, 0))
					return (1);
			}
		}
		/*
		 * If there's any remaining text, we're in a global, and
		 * there's more command to parse.
		 *
		 * !!!
		 * We depend on the fact that non-global commands will eat the
		 * rest of the command line as text input, and before getting
		 * any text input from the user.  Otherwise, we'd have to save
		 * off the command text before or during the call to the text
		 * input function below.
		 */
		if (len != 0)
			cmdp->save_cmd = t;
		cmdp->save_cmdlen = len;
	}

	if (F_ISSET(sp, SC_EX_GLOBAL)) {
		if ((sp->lno = lno) == 0 && db_exist(sp, 1))
			sp->lno = 1;
		return (0);
	}

	/*
	 * If not in a global command, read from the terminal.
	 *
	 * If this code is called by vi, we want to reset the terminal and use
	 * ex's line get routine.  It actually works fine if we use vi's get
	 * routine, but it doesn't look as nice.  Maybe if we had a separate
	 * window or something, but getting a line at a time looks awkward.
	 * However, depending on the screen that we're using, that may not
	 * be possible.
	 */
	if (F_ISSET(sp, SC_VI)) {
		if (gp->scr_screen(sp, SC_EX)) {
			ex_wemsg(sp, cmdp->cmd->name, EXM_NOCANON);
			return (1);
		}

		/* If we're still in the vi screen, move out explicitly. */
		need_newline = !F_ISSET(sp, SC_SCR_EXWROTE);
		F_SET(sp, SC_SCR_EX | SC_SCR_EXWROTE);
		if (need_newline)
			(void)ex_puts(sp, "\n");

		/*
		 * !!!
		 * Users of historical versions of vi sometimes get confused
		 * when they enter append mode, and can't seem to get out of
		 * it.  Give them an informational message.
		 */
		(void)ex_puts(sp,
		    msg_cat(sp, "273|Entering ex input mode.", NULL));
		(void)ex_puts(sp, "\n");
		(void)ex_fflush(sp);
	}

	/*
	 * Set input flags; the ! flag turns off autoindent for append,
	 * change and insert.
	 */
	LF_INIT(TXT_DOTTERM | TXT_NUMBER);
	if (!FL_ISSET(cmdp->iflags, E_C_FORCE) && O_ISSET(sp, O_AUTOINDENT))
		LF_SET(TXT_AUTOINDENT);
	if (O_ISSET(sp, O_BEAUTIFY))
		LF_SET(TXT_BEAUTIFY);

	/*
	 * This code can't use the common screen TEXTH structure (sp->tiq),
	 * as it may already be in use, e.g. ":append|s/abc/ABC/" would fail
	 * as we are only halfway through the text when the append code fires.
	 * Use a local structure instead.  (The ex code would have to use a
	 * local structure except that we're guaranteed to finish remaining
	 * characters in the common TEXTH structure when they were inserted
	 * into the file, above.)
	 */
	TAILQ_INIT(tiq);

	if (ex_txt(sp, tiq, 0, flags))
		return (1);

	TAILQ_FOREACH(tp, tiq, q) {
		if (db_append(sp, 1, lno++, tp->lb, tp->len))
			return (1);
		++cnt;
	}

	/*
	 * Set sp->lno to the final line number value (correcting for a
	 * possible 0 value) as that's historically correct for the final
	 * line value, whether or not the user entered any text.
	 */
	if ((sp->lno = lno) == 0 && db_exist(sp, 1))
		sp->lno = 1;

	return (0);
}
Example #5
/*
 * cl_event --
 *	Return a single event.
 *
 * PUBLIC: int cl_event __P((SCR *, EVENT *, u_int32_t, int));
 */
int
cl_event(SCR *sp, EVENT *evp, u_int32_t flags, int ms)
{
	struct timeval t, *tp;
	CL_PRIVATE *clp;
	size_t lines, columns;
	int changed, nr = 0;
	CHAR_T *wp;
	size_t wlen;
	int rc;

	/*
	 * Queue signal based events.  We never clear SIGHUP or SIGTERM events,
	 * so that we just keep returning them until the editor dies.
	 */
	clp = CLP(sp);
retest:	if (LF_ISSET(EC_INTERRUPT) || F_ISSET(clp, CL_SIGINT)) {
		if (F_ISSET(clp, CL_SIGINT)) {
			F_CLR(clp, CL_SIGINT);
			evp->e_event = E_INTERRUPT;
		} else
			evp->e_event = E_TIMEOUT;
		return (0);
	}
	if (F_ISSET(clp, CL_SIGHUP | CL_SIGTERM | CL_SIGWINCH)) {
		if (F_ISSET(clp, CL_SIGHUP)) {
			evp->e_event = E_SIGHUP;
			return (0);
		}
		if (F_ISSET(clp, CL_SIGTERM)) {
			evp->e_event = E_SIGTERM;
			return (0);
		}
		if (F_ISSET(clp, CL_SIGWINCH)) {
			F_CLR(clp, CL_SIGWINCH);
			if (cl_ssize(sp, 1, &lines, &columns, &changed))
				return (1);
			if (changed) {
				(void)cl_resize(sp, lines, columns);
				evp->e_event = E_WRESIZE;
				return (0);
			}
			/* No real change, ignore the signal. */
		}
	}

	/* Set timer. */
	if (ms == 0)
		tp = NULL;
	else {
		t.tv_sec = ms / 1000;
		t.tv_usec = (ms % 1000) * 1000;
		tp = &t;
	}

	/* Read input characters. */
read:
	switch (cl_read(sp, LF_ISSET(EC_QUOTED | EC_RAW),
	    clp->ibuf + clp->skip, SIZE(clp->ibuf) - clp->skip, &nr, tp)) {
	case INP_OK:
		rc = INPUT2INT5(sp, clp->cw, clp->ibuf, nr + clp->skip,
				wp, wlen);
		evp->e_csp = wp;
		evp->e_len = wlen;
		evp->e_event = E_STRING;
		if (rc < 0) {
		    int n = -rc;
		    memmove(clp->ibuf, clp->ibuf + nr + clp->skip - n, n);
		    clp->skip = n;
		    if (wlen == 0)
			goto read;
		} else if (rc == 0)
		    clp->skip = 0;
		else
		    msgq(sp, M_ERR, "323|Invalid input. Truncated.");
		break;
	case INP_EOF:
		evp->e_event = E_EOF;
		break;
	case INP_ERR:
		evp->e_event = E_ERR;
		break;
	case INP_INTR:
		goto retest;
	case INP_TIMEOUT:
		evp->e_event = E_TIMEOUT;
		break;
	default:
		abort();
	}
	return (0);
}
Example #6
/*
 * __rec_review --
 *	Get exclusive access to the page and review the page and its subtree
 *	for conditions that would block its eviction.
 *
 *	The ref and page arguments may appear to be redundant, because usually
 *	ref->page == page and page->ref == ref.  However, we need both because
 *	(a) there are cases where ref == NULL (e.g., for root page or during
 *	salvage), and (b) we can't safely look at page->ref until we have a
 *	hazard reference.
 */
static int
__rec_review(WT_SESSION_IMPL *session,
    WT_REF *ref, WT_PAGE *page, uint32_t flags, int top)
{
	WT_DECL_RET;
	WT_PAGE_MODIFY *mod;
	WT_TXN *txn;
	uint32_t i;

	txn = &session->txn;

	/*
	 * Get exclusive access to the page if our caller doesn't have the tree
	 * locked down.
	 */
	if (!LF_ISSET(WT_REC_SINGLE))
		WT_RET(__hazard_exclusive(session, ref, top));

	/*
	 * Recurse through the page's subtree: this happens first because we
	 * have to write pages in depth-first order, otherwise we'll dirty
	 * pages after we've written them.
	 */
	if (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT)
		WT_REF_FOREACH(page, ref, i)
			switch (ref->state) {
			case WT_REF_DISK:		/* On-disk */
			case WT_REF_DELETED:		/* On-disk, deleted */
				break;
			case WT_REF_MEM:		/* In-memory */
				WT_RET(__rec_review(
				    session, ref, ref->page, flags, 0));
				break;
			case WT_REF_EVICT_WALK:		/* Walk point */
			case WT_REF_LOCKED:		/* Being evicted */
			case WT_REF_READING:		/* Being read */
				return (EBUSY);
			}

	/*
	 * Check if this page can be evicted:
	 *
	 * Fail if the top-level page is a page expected to be removed from the
	 * tree as part of eviction (an empty page or a split-merge page).  Note
	 * "split" pages are NOT included in this test, because a split page can
	 * be separately evicted, at which point it's replaced in its parent by
	 * a reference to a split-merge page.  That's a normal part of the leaf
	 * page life-cycle if it grows too large and must be pushed out of the
	 * cache.  There is also an exception for empty pages, the root page may
	 * be empty when evicted, but that only happens when the tree is closed.
	 *
	 * Fail if any page in the top-level page's subtree can't be merged into
	 * its parent.  You can't evict a page that references such in-memory
	 * pages, they must be evicted first.  The test is necessary but should
	 * not fire much: the LRU-based eviction code is biased for leaf pages,
	 * an internal page shouldn't be selected for LRU-based eviction until
	 * its children have been evicted.  Empty, split and split-merge pages
	 * are all included in this test, they can all be merged into a parent.
	 *
	 * We have to write dirty pages to know their final state, a page marked
	 * empty may have had records added since reconciliation, a page marked
	 * split may have had records deleted and no longer need to split.
	 * Split-merge pages are the exception: they can never be changed into
	 * anything other than a split-merge page and are merged regardless of
	 * being clean or dirty.
	 *
	 * Writing the page is expensive, do a cheap test first: if it doesn't
	 * appear a subtree page can be merged, quit.  It's possible the page
	 * has been emptied since it was last reconciled, and writing it before
	 * testing might be worthwhile, but it's more probable we're attempting
	 * to evict an internal page with live children, and that's a waste of
	 * time.
	 *
	 * We don't do a cheap test for the top-level page: we're not called
	 * to evict split-merge pages, which means the only interesting case
	 * is an empty page.  If the eviction thread picked an "empty" page
	 * for eviction, it must have had reason, probably the empty page got
	 * really, really full.
	 */
	mod = page->modify;
	if (!top && (mod == NULL || !F_ISSET(mod,
	    WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE)))
		return (EBUSY);

	/* If the page is dirty, write it so we know the final state. */
	if (__wt_page_is_modified(page) &&
	    !F_ISSET(mod, WT_PM_REC_SPLIT_MERGE)) {
		ret = __wt_rec_write(session, page, NULL, flags);

		/* If there are unwritten changes on the page, give up. */
		if (ret == 0 &&
		    !LF_ISSET(WT_REC_SINGLE) && __wt_page_is_modified(page))
			ret = EBUSY;
		if (ret == EBUSY) {
			WT_VERBOSE_RET(session, evict,
			    "page %p written but not clean", page);

			if (F_ISSET(txn, TXN_RUNNING) &&
			    ++txn->eviction_fails >= 100) {
				txn->eviction_fails = 0;
				ret = WT_DEADLOCK;
				WT_STAT_INCR(
				    S2C(session)->stats, txn_fail_cache);
			}

			/*
			 * If there aren't multiple cursors active, there
			 * are no consistency issues: try to bump our snapshot.
			 */
			if (session->ncursors <= 1) {
				__wt_txn_read_last(session);
				__wt_txn_read_first(session);
			}

			switch (page->type) {
			case WT_PAGE_COL_FIX:
			case WT_PAGE_COL_VAR:
				__wt_col_leaf_obsolete(session, page);
				break;
			case WT_PAGE_ROW_LEAF:
				__wt_row_leaf_obsolete(session, page);
				break;
			}
		}
		WT_RET(ret);

		txn->eviction_fails = 0;
	}

	/*
	 * Repeat the eviction tests.
	 *
	 * Fail if the top-level page should be merged into its parent, and it's
	 * not the root page.
	 *
	 * Fail if a page in the top-level page's subtree can't be merged into
	 * its parent.
	 */
	if (top) {
		/*
		 * We never get a top-level split-merge page to evict, they are
		 * ignored by the eviction thread.  Check out of sheer paranoia.
		 */
		if (mod != NULL) {
			if (F_ISSET(mod, WT_PM_REC_SPLIT_MERGE))
				return (EBUSY);
			if (F_ISSET(mod, WT_PM_REC_EMPTY) &&
			    !WT_PAGE_IS_ROOT(page))
				return (EBUSY);
		}
	} else if (mod == NULL || !F_ISSET(mod,
	    WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE))
		return (EBUSY);
	return (0);
}
Example #7
/*
 * __wt_page_in_func --
 *	Acquire a hazard pointer to a page; if the page is not in-memory,
 *	read it from the disk and build an in-memory version.
 */
int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
	WT_DECL_RET;
	WT_PAGE *page;
	int busy, force_attempts, oldgen;

	for (force_attempts = oldgen = 0;;) {
		switch (ref->state) {
		case WT_REF_DISK:
		case WT_REF_DELETED:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);

			/*
			 * The page isn't in memory, attempt to read it.
			 * Make sure there is space in the cache.
			 */
			WT_RET(__wt_cache_full_check(session));
			WT_RET(__wt_cache_read(session, ref));
			oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
			    F_ISSET(session, WT_SESSION_NO_CACHE);
			continue;
		case WT_REF_READING:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);
			/* FALLTHROUGH */
		case WT_REF_LOCKED:
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);
			/* The page is busy -- wait. */
			break;
		case WT_REF_SPLIT:
			return (WT_RESTART);
		case WT_REF_MEM:
			/*
			 * The page is in memory: get a hazard pointer, update
			 * the page's LRU and return.  The expected reason we
			 * can't get a hazard pointer is because the page is
			 * being evicted; yield and try again.
			 */
#ifdef HAVE_DIAGNOSTIC
			WT_RET(
			    __wt_hazard_set(session, ref, &busy, file, line));
#else
			WT_RET(__wt_hazard_set(session, ref, &busy));
#endif
			if (busy)
				break;

			page = ref->page;
			WT_ASSERT(session, page != NULL);

			/* Forcibly evict pages that are too big. */
			if (!LF_ISSET(WT_READ_NO_EVICT) &&
			    force_attempts < 10 &&
			    __evict_force_check(session, page)) {
				++force_attempts;
				WT_RET(__wt_page_release(session, ref, flags));
				break;
			}

			/* Check if we need an autocommit transaction. */
			if ((ret = __wt_txn_autocommit_check(session)) != 0) {
				WT_TRET(__wt_hazard_clear(session, page));
				return (ret);
			}

			/*
			 * If we read the page and we are configured to not
			 * trash the cache, set the oldest read generation so
			 * the page is forcibly evicted as soon as possible.
			 *
			 * Otherwise, update the page's read generation.
			 */
			if (oldgen && page->read_gen == WT_READGEN_NOTSET)
				__wt_page_evict_soon(page);
			else if (!LF_ISSET(WT_READ_NO_GEN) &&
			    page->read_gen < __wt_cache_read_gen(session))
				page->read_gen =
				    __wt_cache_read_gen_set(session);

			return (0);
		WT_ILLEGAL_VALUE(session);
		}

		/* We failed to get the page -- yield before retrying. */
		__wt_yield();
	}
}
Example #8
/*
 * v_cmd --
 *	Get a vi command.
 */
static gcret_t
v_cmd(SCR *sp, VICMD *dp, VICMD *vp,
    VICMD *ismotion,		/* Previous key if getting motion component. */
    int *comcountp, int *mappedp)
{
	enum { COMMANDMODE, ISPARTIAL, NOTPARTIAL } cpart;
	ARG_CHAR_T key;
	VIKEYS const *kp;
	gcret_t gcret;
	u_int flags;
	const char *s;

	/*
	 * Get an event command or a key.  Event commands are simple, and
	 * don't have any additional information.
	 */
	cpart = ismotion == NULL ? COMMANDMODE : ISPARTIAL;
	gcret = v_key(sp, vp, 1, EC_MAPCOMMAND);
	if (gcret != GC_OK) {
		if (gcret != GC_EVENT)
			return (gcret);
		if (v_event(sp, vp))
			return (GC_ERR);
		if (ismotion != NULL && !F_ISSET(vp->kp, V_MOVE))
			v_event_err(sp, &vp->ev);
		return (GC_EVENT);
	}

	/*
	 * Keys are not simple.  (Although vi's command structure is less complex
	 * than ex's, and don't think I'm not grateful!)  The command syntax is:
	 *
	 *	[count] [buffer] [count] key [[motion] | [buffer] [character]]
	 *
	 * and there are, of course, several special cases.  The motion value
	 * is itself a vi command, with the syntax:
	 *
	 *	[count] key [character]
	 *
	 * <escape> cancels partial commands, i.e. a command where at least
	 * one non-numeric character has been entered.  Otherwise, it beeps
	 * the terminal.
	 *
	 * !!!
	 * POSIX 1003.2-1992 explicitly disallows cancelling commands where
	 * all that's been entered is a number, requiring that the terminal
	 * be alerted.
	 */
	if (vp->ev.e_value == K_ESCAPE)
		goto esc;

	/*
	 * Commands that are mapped are treated differently (e.g., they
	 * don't set the dot command).  Pass that information back.
	 */
	if (FL_ISSET(vp->ev.e_flags, CH_MAPPED))
		*mappedp = 1;
	key = vp->ev.e_c;

	if (ismotion == NULL)
		cpart = NOTPARTIAL;

	/* Pick up an optional buffer. */
	if (key == '"') {
		cpart = ISPARTIAL;
		if (ismotion != NULL) {
			v_emsg(sp, NULL, VIM_COMBUF);
			return (GC_ERR);
		}
		KEY(vp->buffer, 0);
		F_SET(vp, VC_BUFFER);

		KEY(key, EC_MAPCOMMAND);
	}

	/*
	 * Pick up an optional count, where a leading 0 isn't a count, it's
	 * a command.  When a count is specified, the dot command behaves
	 * differently, pass the information back.
	 */
	if (ISDIGIT(key) && key != '0') {
		if (v_count(sp, vp, key, &vp->count))
			return (GC_ERR);

		F_SET(vp, VC_C1SET);
		*comcountp = 1;

		KEY(key, EC_MAPCOMMAND);
	} else
		*comcountp = 0;

	/* Pick up optional buffer. */
	if (key == '"') {
		cpart = ISPARTIAL;
		if (F_ISSET(vp, VC_BUFFER)) {
			msgq(sp, M_ERR, "234|Only one buffer may be specified");
			return (GC_ERR);
		}
		if (ismotion != NULL) {
			v_emsg(sp, NULL, VIM_COMBUF);
			return (GC_ERR);
		}
		KEY(vp->buffer, 0);
		F_SET(vp, VC_BUFFER);

		KEY(key, EC_MAPCOMMAND);
	}

	/* Check for an OOB command key. */
	cpart = ISPARTIAL;
	if (key > MAXVIKEY) {
		v_emsg(sp, (const char *)KEY_NAME(sp, key), VIM_NOCOM);
		return (GC_ERR);
	}
	kp = &vikeys[vp->key = key];

	/*
	 * !!!
	 * Historically, D accepted and then ignored a count.  Match it.
	 */
	if (vp->key == 'D' && F_ISSET(vp, VC_C1SET)) {
		*comcountp = 0;
		vp->count = 0;
		F_CLR(vp, VC_C1SET);
	}

	/*
	 * There are several commands that we implement as aliases, both
	 * to match historic practice and to ensure consistency.  Check
	 * for command aliases.
	 */
	if (kp->func == NULL && (kp = v_alias(sp, vp, kp)) == NULL)
		return (GC_ERR);

	/* The tildeop option makes the ~ command take a motion. */
	if (key == '~' && O_ISSET(sp, O_TILDEOP))
		kp = &tmotion;

	vp->kp = kp;

	/*
	 * Find the command.  The only legal command with no underlying
	 * function is dot.  It's historic practice that <escape> doesn't
	 * just erase the preceding number, it beeps the terminal as well.
	 * It's a common problem, so just beep the terminal unless verbose
	 * was set.
	 */
	if (kp->func == NULL) {
		if (key != '.') {
			v_emsg(sp, (const char *)KEY_NAME(sp, key),
			    vp->ev.e_value == K_ESCAPE ?
			    VIM_NOCOM_B : VIM_NOCOM);
			return (GC_ERR);
		}

		/* If called for a motion command, stop now. */
		if (dp == NULL)
			goto usage;

		/*
		 * !!!
		 * If a '.' is immediately entered after an undo command, we
		 * replay the log instead of redoing the last command.  This
		 * is necessary because 'u' can't set the dot command -- see
		 * vi/v_undo.c:v_undo for details.
		 */
		if (VIP(sp)->u_ccnt == sp->ccnt) {
			vp->kp = &vikeys['u'];
			F_SET(vp, VC_ISDOT);
			return (GC_OK);
		}

		/* Otherwise, a repeatable command must have been executed. */
		if (!F_ISSET(dp, VC_ISDOT)) {
			msgq(sp, M_ERR, "208|No command to repeat");
			return (GC_ERR);
		}

		/* Set new count/buffer, if any, and return. */
		if (F_ISSET(vp, VC_C1SET)) {
			F_SET(dp, VC_C1SET);
			dp->count = vp->count;
		}
		if (F_ISSET(vp, VC_BUFFER))
			dp->buffer = vp->buffer;

		*vp = *dp;
		return (GC_OK);
	}

	/* Set the flags based on the command flags. */
	flags = kp->flags;

	/* Check for illegal count. */
	if (F_ISSET(vp, VC_C1SET) && !LF_ISSET(V_CNT))
		goto usage;

	/* Illegal motion command. */
	if (ismotion == NULL) {
		/* Illegal buffer. */
		if (!LF_ISSET(V_OBUF) && F_ISSET(vp, VC_BUFFER))
			goto usage;

		/* Required buffer. */
		if (LF_ISSET(V_RBUF)) {
			KEY(vp->buffer, 0);
			F_SET(vp, VC_BUFFER);
		}
	}

	/*
	 * Special case: '[', ']' and 'Z' commands.  Doesn't the fact that
	 * the *single* characters don't mean anything but the *doubled*
	 * characters do, just frost your shorts?
	 */
	if (vp->key == '[' || vp->key == ']' || vp->key == 'Z') {
		/*
		 * Historically, half entered [[, ]] or Z commands weren't
		 * cancelled by <escape>, the terminal was beeped instead.
		 * POSIX.2-1992 probably didn't notice, and requires that
		 * they be cancelled instead of beeping.  Seems fine to me.
		 *
		 * Don't set the EC_MAPCOMMAND flag, apparently ] is a popular
		 * vi meta-character, and we don't want the user to wait while
		 * we time out a possible mapping.  This *appears* to match
		 * historic vi practice, but with mapping characters, You Just
		 * Never Know.
		 */
		KEY(key, 0);

		if (vp->key != key) {
usage:			if (ismotion == NULL)
				s = kp->usage;
			else if (ismotion->key == '~' && O_ISSET(sp, O_TILDEOP))
				s = tmotion.usage;
			else
				s = vikeys[ismotion->key].usage;
			v_emsg(sp, s, VIM_USAGE);
			return (GC_ERR);
		}
	}
	/* Special case: 'z' command. */
	if (vp->key == 'z') {
		KEY(vp->character, 0);
		if (ISDIGIT(vp->character)) {
			if (v_count(sp, vp, vp->character, &vp->count2))
				return (GC_ERR);
			F_SET(vp, VC_C2SET);
			KEY(vp->character, 0);
		}
	}

	/*
	 * Commands that have motion components can be doubled to imply the
	 * current line.
	 */
	if (ismotion != NULL && ismotion->key != key && !LF_ISSET(V_MOVE)) {
		msgq(sp, M_ERR, "210|%s may not be used as a motion command",
		    KEY_NAME(sp, key));
		return (GC_ERR);
	}

	/* Pick up required trailing character. */
	if (LF_ISSET(V_CHAR))
		KEY(vp->character, 0);

	/* Get any associated cursor word. */
	if (F_ISSET(kp, V_KEYW) && v_curword(sp))
		return (GC_ERR);

	return (GC_OK);

esc:	switch (cpart) {
	case COMMANDMODE:
		msgq(sp, M_BERR, "211|Already in command mode");
		return (GC_ERR_NOFLUSH);
	case ISPARTIAL:
		break;
	case NOTPARTIAL:
		(void)sp->gp->scr_bell(sp);
		break;
	}
	return (GC_ERR);
}
Example #9
/*
 * vi --
 * 	Main vi command loop.
 *
 * PUBLIC: int vi __P((SCR **));
 */
int
vi(SCR **spp)
{
	GS *gp;
	WIN *wp;
	MARK abst;
	SCR *next, *sp;
	VICMD cmd, *vp;
	VI_PRIVATE *vip;
	int comcount, mapped, rval;

	/* Get the first screen. */
	sp = *spp;
	wp = sp->wp;
	gp = sp->gp;

	/* Initialize the command structure. */
	vp = &cmd;
	memset(vp, 0, sizeof(VICMD));

	/* Reset strange attraction. */
	F_SET(vp, VM_RCM_SET);

	/* Initialize the vi screen. */
	if (v_init(sp))
		return (1);

	/* Set the focus. */
	(void)sp->gp->scr_rename(sp, sp->frp->name, 1);

	for (vip = VIP(sp), rval = 0;;) {
		/* Resolve messages. */
		if (!MAPPED_KEYS_WAITING(sp) && vs_resolve(sp, NULL, 0))
			goto ret;

		/*
		 * If not skipping a refresh, return to command mode and
		 * refresh the screen.
		 */
		if (F_ISSET(vip, VIP_S_REFRESH))
			F_CLR(vip, VIP_S_REFRESH);
		else {
			sp->showmode = SM_COMMAND;
			if (vs_refresh(sp, 0))
				goto ret;
		}

		/* Set the new favorite position. */
		if (F_ISSET(vp, VM_RCM_SET | VM_RCM_SETFNB | VM_RCM_SETNNB)) {
			F_CLR(vip, VIP_RCM_LAST);
			(void)vs_column(sp, &sp->rcm);
		}

		/*
		 * If not currently in a map, log the cursor position,
		 * and set a flag so that this command can become the
		 * DOT command.
		 */
		if (MAPPED_KEYS_WAITING(sp))
			mapped = 1;
		else {
			if (log_cursor(sp))
				goto err;
			mapped = 0;
		}

		/*
		 * There may be an ex command waiting, and we returned here
		 * only because we exited a screen or file.  In this case,
		 * we simply go back into the ex parser.
		 */
		if (EXCMD_RUNNING(wp)) {
			vp->kp = &vikeys[':'];
			goto ex_continue;
		}

		/* Refresh the command structure. */
		memset(vp, 0, sizeof(VICMD));

		/*
		 * We get a command, which may or may not have an associated
		 * motion.  If it does, we get it too, calling its underlying
		 * function to get the resulting mark.  We then call the
		 * command setting the cursor to the resulting mark.
		 *
		 * !!!
		 * Vi historically flushed mapped characters on error, but
		 * entering extra <escape> characters at the beginning of
		 * a map wasn't considered an error -- in fact, users would
		 * put leading <escape> characters in maps to clean up vi
		 * state before the map was interpreted.  Beauty!
		 */
		switch (v_cmd(sp, DOT, vp, NULL, &comcount, &mapped)) {
		case GC_ERR:
			goto err;
		case GC_ERR_NOFLUSH:
			goto gc_err_noflush;
		case GC_FATAL:
			goto ret;
		case GC_INTERRUPT:
			goto intr;
		case GC_EVENT:
		case GC_OK:
			break;
		}

		/* Check for security setting. */
		if (F_ISSET(vp->kp, V_SECURE) && O_ISSET(sp, O_SECURE)) {
			ex_emsg(sp, (const char *)KEY_NAME(sp, vp->key),
			    EXM_SECURE);
			goto err;
		}

		/*
		 * Historical practice: if a dot command gets a new count,
		 * any motion component goes away, i.e. "d3w2." deletes a
		 * total of 5 words.
		 */
		if (F_ISSET(vp, VC_ISDOT) && comcount)
			DOTMOTION->count = 1;

		/* Copy the key flags into the local structure. */
		F_SET(vp, vp->kp->flags);

		/* Prepare to set the previous context. */
		if (F_ISSET(vp, V_ABS | V_ABS_C | V_ABS_L)) {
			abst.lno = sp->lno;
			abst.cno = sp->cno;
		}

		/*
		 * Set the three cursor locations to the current cursor.  The
		 * underlying routines don't bother if the cursor doesn't move.
		 * This also handles line commands (e.g. Y) defaulting to the
		 * current line.
		 */
		vp->m_start.lno = vp->m_stop.lno = vp->m_final.lno = sp->lno;
		vp->m_start.cno = vp->m_stop.cno = vp->m_final.cno = sp->cno;

		/*
		 * Do any required motion; v_motion sets the from MARK and the
		 * line mode flag, as well as the VM_RCM flags.
		 */
		if (F_ISSET(vp, V_MOTION) &&
		    v_motion(sp, DOTMOTION, vp, &mapped)) {
			if (INTERRUPTED(sp))
				goto intr;
			goto err;
		}

		/*
		 * If a count is set and the command is line oriented, set the
		 * to MARK here relative to the cursor/from MARK.  This is for
		 * commands that take both counts and motions, i.e. "4yy" and
		 * "y%".  As there's no way the command can know which the user
		 * did, we have to do it here.  (There are commands that are
		 * line oriented and that take counts ("#G", "#H"), for which
		 * this calculation is either completely meaningless or wrong.
		 * Each command must validate the value for itself.)
		 */
		if (F_ISSET(vp, VC_C1SET) && F_ISSET(vp, VM_LMODE))
			vp->m_stop.lno += vp->count - 1;

		/* Increment the command count. */
		++sp->ccnt;

#if defined(DEBUG) && defined(COMLOG)
		v_comlog(sp, vp);
#endif
		/* Call the function. */
ex_continue:	if (vp->kp->func(sp, vp))
			goto err;
#ifdef DEBUG
		/* Make sure no function left the temporary space locked. */
		if (F_ISSET(wp, W_TMP_INUSE)) {
			F_CLR(wp, W_TMP_INUSE);
			msgq(sp, M_ERR,
			    "232|vi: temporary buffer not released");
		}
#endif
		/*
		 * If we're exiting this screen, move to the next one, or, if
		 * there aren't any more, return to the main editor loop.  The
		 * ordering is careful, don't discard the contents of sp until
		 * the end.
		 */
		if (F_ISSET(sp, SC_EXIT | SC_EXIT_FORCE)) {
			if (file_end(sp, NULL, F_ISSET(sp, SC_EXIT_FORCE)))
				goto ret;
			if (vs_discard(sp, &next))
				goto ret;
			if (next == NULL && vs_swap(sp, &next, NULL))
				goto ret;
			*spp = next;
			if (screen_end(sp))
				goto ret;
			if (next == NULL)
				break;

			/* Switch screens, change focus. */
			sp = next;
			vip = VIP(sp);
			(void)sp->gp->scr_rename(sp, sp->frp->name, 1);

			/* Don't trust the cursor. */
			F_SET(vip, VIP_CUR_INVALID);

			continue;
		}

		/*
		 * Set the dot command structure.
		 *
		 * !!!
		 * Historically, commands which used mapped keys did not
		 * set the dot command, with the exception of the text
		 * input commands.
		 */
		if (F_ISSET(vp, V_DOT) && !mapped) {
			*DOT = cmd;
			F_SET(DOT, VC_ISDOT);

			/*
			 * If a count was supplied for both the command and
			 * its motion, the count was used only for the motion.
			 * Turn the count back on for the dot structure.
			 */
			if (F_ISSET(vp, VC_C1RESET))
				F_SET(DOT, VC_C1SET);

			/* VM flags aren't retained. */
			F_CLR(DOT, VM_COMMASK | VM_RCM_MASK);
		}

		/*
		 * Some vi row movements are "attracted" to the last position
		 * set, i.e. the VM_RCM commands are moths to the VM_RCM_SET
		 * commands' candle.  If the movement is to the EOL the vi
		 * command handles it.  If it's to the beginning, we handle it
		 * here.
		 *
		 * Note, some commands (e.g. _, ^) don't set the VM_RCM_SETFNB
		 * flag, but do the work themselves.  The reason is that they
		 * have to modify the column in case they're being used as a
		 * motion component.  Other similar commands (e.g. +, -) don't
		 * have to modify the column because they are always line mode
		 * operations when used as motions, so the column number isn't
		 * of any interest.
		 *
		 * Does this totally violate the screen and editor layering?
		 * You betcha.  As they say, if you think you understand it,
		 * you don't.
		 */
		switch (F_ISSET(vp, VM_RCM_MASK)) {
		case 0:
		case VM_RCM_SET:
			break;
		case VM_RCM:
			vp->m_final.cno = vs_rcm(sp,
			    vp->m_final.lno, F_ISSET(vip, VIP_RCM_LAST));
			break;
		case VM_RCM_SETLAST:
			F_SET(vip, VIP_RCM_LAST);
			break;
		case VM_RCM_SETFNB:
			vp->m_final.cno = 0;
			/* FALLTHROUGH */
		case VM_RCM_SETNNB:
			if (nonblank(sp, vp->m_final.lno, &vp->m_final.cno))
				goto err;
			break;
		default:
			abort();
		}

		/* Update the cursor. */
		sp->lno = vp->m_final.lno;
		sp->cno = vp->m_final.cno;

		/*
		 * Set the absolute mark -- set even if a tags or similar
		 * command, since the tag may be moving to the same file.
		 */
		if ((F_ISSET(vp, V_ABS) ||
		    (F_ISSET(vp, V_ABS_L) && sp->lno != abst.lno) ||
		    (F_ISSET(vp, V_ABS_C) &&
		    (sp->lno != abst.lno || sp->cno != abst.cno))) &&
		    mark_set(sp, ABSMARK1, &abst, 1))
			goto err;

		if (0) {
err:			if (v_event_flush(sp, CH_MAPPED))
				msgq(sp, M_BERR,
			    "110|Vi command failed: mapped keys discarded");
		}

		/*
		 * Check and clear interrupts.  There's an obvious race, but
		 * it's not worth fixing.
		 */
gc_err_noflush:	if (INTERRUPTED(sp)) {
intr:			CLR_INTERRUPT(sp);
			if (v_event_flush(sp, CH_MAPPED))
				msgq(sp, M_ERR,
				    "231|Interrupted: mapped keys discarded");
			else
				msgq(sp, M_ERR, "236|Interrupted");
		}

		/* If the last command switched screens, update. */
		if (F_ISSET(sp, SC_SSWITCH)) {
			F_CLR(sp, SC_SSWITCH);

			/*
			 * If the current screen is still displayed, it will
			 * need a new status line.
			 */
			F_SET(sp, SC_STATUS);

			/* Switch screens, change focus. */
			sp = sp->nextdisp;
			vip = VIP(sp);
			(void)sp->gp->scr_rename(sp, sp->frp->name, 1);

			/* Don't trust the cursor. */
			F_SET(vip, VIP_CUR_INVALID);

			/* Refresh so we can display messages. */
			if (vs_refresh(sp, 1))
				return (1);
		}

		/* If the last command switched files, change focus. */
		if (F_ISSET(sp, SC_FSWITCH)) {
			F_CLR(sp, SC_FSWITCH);
			(void)sp->gp->scr_rename(sp, sp->frp->name, 1);
		}

		/* If leaving vi, return to the main editor loop. */
		if (F_ISSET(gp, G_SRESTART) || F_ISSET(sp, SC_EX)) {
			*spp = sp;
			v_dtoh(sp);
			gp->scr_discard(sp, NULL);
			break;
		}
	}
	if (0)
ret:		rval = 1;
	return (rval);
}
Example #10
DB *
__rec_open(const char *fname, int flags, mode_t mode, const RECNOINFO *openinfo,
    int dflags)
{
	BTREE *t;
	BTREEINFO btopeninfo;
	DB *dbp;
	PAGE *h;
	struct stat sb;
	int rfd = -1;	/* pacify gcc */
	int sverrno;

	dbp = NULL;
	/* Open the user's file -- if this fails, we're done. */
	if (fname != NULL) {
		if ((rfd = open(fname, flags | O_CLOEXEC, mode)) == -1)
			return NULL;
	}

	/* Create a btree in memory (backed by disk). */
	if (openinfo) {
		if (openinfo->flags & ~(R_FIXEDLEN | R_NOKEY | R_SNAPSHOT))
			goto einval;
		btopeninfo.flags = 0;
		btopeninfo.cachesize = openinfo->cachesize;
		btopeninfo.maxkeypage = 0;
		btopeninfo.minkeypage = 0;
		btopeninfo.psize = openinfo->psize;
		btopeninfo.compare = NULL;
		btopeninfo.prefix = NULL;
		btopeninfo.lorder = openinfo->lorder;
		dbp = __bt_open(openinfo->bfname,
		    O_RDWR, S_IRUSR | S_IWUSR, &btopeninfo, dflags);
	} else
		dbp = __bt_open(NULL, O_RDWR, S_IRUSR | S_IWUSR, NULL, dflags);
	if (dbp == NULL)
		goto err;

	/*
	 * Some fields in the tree structure are recno specific.  Fill them
	 * in and make the btree structure look like a recno structure.  We
	 * don't change the bt_ovflsize value, it's close enough and slightly
	 * bigger.
	 */
	t = dbp->internal;
	if (openinfo) {
		if (openinfo->flags & R_FIXEDLEN) {
			F_SET(t, R_FIXLEN);
			t->bt_reclen = openinfo->reclen;
			if (t->bt_reclen == 0)
				goto einval;
		}
		t->bt_bval = openinfo->bval;
	} else
		t->bt_bval = '\n';

	F_SET(t, R_RECNO);
	if (fname == NULL)
		F_SET(t, R_EOF | R_INMEM);
	else
		t->bt_rfd = rfd;

	if (fname != NULL) {
		/*
		 * In 4.4BSD, stat(2) returns true for ISSOCK on pipes.
		 * Unfortunately, that's not portable, so we use lseek
		 * and check the errno values.
		 */
		errno = 0;
		if (lseek(rfd, (off_t)0, SEEK_CUR) == -1 && errno == ESPIPE) {
			switch (flags & O_ACCMODE) {
			case O_RDONLY:
				F_SET(t, R_RDONLY);
				break;
			default:
				goto einval;
			}
slow:			if ((t->bt_rfp = fdopen(rfd, "r")) == NULL)
				goto err;
			F_SET(t, R_CLOSEFP);
			t->bt_irec =
			    F_ISSET(t, R_FIXLEN) ? __rec_fpipe : __rec_vpipe;
		} else {
			switch (flags & O_ACCMODE) {
			case O_RDONLY:
				F_SET(t, R_RDONLY);
				break;
			case O_RDWR:
				break;
			default:
				goto einval;
			}

			if (fstat(rfd, &sb))
				goto err;
			/*
			 * Kluge -- we'd like to test to see if the file is too
			 * big to mmap.  Since we don't know what size or type
			 * off_t's or size_t's are, what the largest unsigned
			 * integral type is, or what random insanity the local
			 * C compiler will perpetrate, doing the comparison in
			 * a portable way is flatly impossible.  Hope that mmap
			 * fails if the file is too large.
			 */
			if (sb.st_size == 0)
				F_SET(t, R_EOF);
			else {
#ifdef MMAP_NOT_AVAILABLE
				/*
				 * XXX
				 * Mmap doesn't work correctly on many current
				 * systems.  In particular, it can fail subtly,
				 * with cache coherency problems.  Don't use it
				 * for now.
				 */
				t->bt_msize = sb.st_size;
				if ((t->bt_smap = mmap(NULL, t->bt_msize,
				    PROT_READ, MAP_FILE | MAP_PRIVATE, rfd,
				    (off_t)0)) == (caddr_t)-1)
					goto slow;
				t->bt_cmap = t->bt_smap;
				t->bt_emap = t->bt_smap + sb.st_size;
				t->bt_irec = F_ISSET(t, R_FIXLEN) ?
				    __rec_fmap : __rec_vmap;
				F_SET(t, R_MEMMAPPED);
#else
				goto slow;
#endif
			}
		}
	}

	/* Use the recno routines. */
	dbp->close = __rec_close;
	dbp->del = __rec_delete;
	dbp->fd = __rec_fd;
	dbp->get = __rec_get;
	dbp->put = __rec_put;
	dbp->seq = __rec_seq;
	dbp->sync = __rec_sync;

	/* If the root page was created, reset the flags. */
	if ((h = mpool_get(t->bt_mp, P_ROOT, 0)) == NULL)
		goto err;
	if ((h->flags & P_TYPE) == P_BLEAF) {
		F_CLR(h, P_TYPE);
		F_SET(h, P_RLEAF);
		mpool_put(t->bt_mp, h, MPOOL_DIRTY);
	} else
		mpool_put(t->bt_mp, h, 0);

	if (openinfo && openinfo->flags & R_SNAPSHOT &&
	    !F_ISSET(t, R_EOF | R_INMEM) &&
	    t->bt_irec(t, MAX_REC_NUMBER) == RET_ERROR)
                goto err;
	return (dbp);

einval:	errno = EINVAL;
err:	sverrno = errno;
	if (dbp != NULL)
		(void)__bt_close(dbp);
	if (fname != NULL)
		(void)close(rfd);
	errno = sverrno;
	return (NULL);
}
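__rec_open is reached through the historic 4.4BSD dbopen() interface by requesting a DB_RECNO database. A minimal sketch, assuming a flat text file of newline-separated records; the file name is an assumption.

#include <sys/types.h>
#include <fcntl.h>
#include <stdio.h>
#include <db.h>

/*
 * print_first_record --
 *	Sketch: open a recno database over a flat file and fetch record 1.
 *	db->get returns 0 on success, 1 if the record doesn't exist and
 *	-1 on error.
 */
static int
print_first_record(void)
{
	DB *db;
	DBT data, key;
	recno_t rec;

	if ((db = dbopen("records.txt", O_RDONLY, 0, DB_RECNO, NULL)) == NULL)
		return (1);

	rec = 1;			/* recno keys are 1-based record numbers */
	key.data = &rec;
	key.size = sizeof(rec);
	switch (db->get(db, &key, &data, 0)) {
	case 0:
		printf("%.*s\n", (int)data.size, (char *)data.data);
		break;
	case 1:				/* no such record */
		break;
	default:			/* error */
		(void)db->close(db);
		return (1);
	}
	return (db->close(db));
}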
Example #11
/*
 * __wt_page_out --
 *	Discard an in-memory page, freeing all memory associated with it.
 */
void
__wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep)
{
	WT_PAGE *page;
	WT_PAGE_HEADER *dsk;
	WT_PAGE_MODIFY *mod;

	/*
	 * Kill our caller's reference, do our best to catch races.
	 */
	page = *pagep;
	*pagep = NULL;

	if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
		__wt_page_modify_clear(session, page);

	/*
	 * We should never discard:
	 * - a dirty page,
	 * - a page queued for eviction, or
	 * - a locked page.
	 */
	WT_ASSERT(session, !__wt_page_is_modified(page));
	WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU));
	WT_ASSERT(session, !__wt_rwlock_islocked(session, &page->page_lock));

	/*
	 * If a root page split, there may be one or more pages linked from the
	 * page; walk the list, discarding pages.
	 */
	switch (page->type) {
	case WT_PAGE_COL_INT:
	case WT_PAGE_ROW_INT:
		mod = page->modify;
		if (mod != NULL && mod->mod_root_split != NULL)
			__wt_page_out(session, &mod->mod_root_split);
		break;
	}

	/* Update the cache's information. */
	__wt_cache_page_evict(session, page);

	dsk = (WT_PAGE_HEADER *)page->dsk;
	if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC))
		__wt_cache_page_image_decr(session, dsk->mem_size);

	/* Discard any mapped image. */
	if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_MAPPED))
		(void)S2BT(session)->bm->map_discard(
		    S2BT(session)->bm, session, dsk, (size_t)dsk->mem_size);

	/*
	 * If discarding the page as part of process exit, the application may
	 * configure to leak the memory rather than do the work.
	 */
	if (F_ISSET(S2C(session), WT_CONN_LEAK_MEMORY))
		return;

	/* Free the page modification information. */
	if (page->modify != NULL)
		__free_page_modify(session, page);

	switch (page->type) {
	case WT_PAGE_COL_FIX:
		break;
	case WT_PAGE_COL_INT:
	case WT_PAGE_ROW_INT:
		__free_page_int(session, page);
		break;
	case WT_PAGE_COL_VAR:
		__free_page_col_var(session, page);
		break;
	case WT_PAGE_ROW_LEAF:
		__free_page_row_leaf(session, page);
		break;
	}

	/* Discard any allocated disk image. */
	if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC))
		__wt_overwrite_and_free_len(session, dsk, dsk->mem_size);

	__wt_overwrite_and_free(session, page);
}
Example #12
/*
 * __cursor_valid --
 *	Return if the cursor references a valid key/value pair.
 */
static inline bool
__cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
{
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_COL *cip;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	btree = cbt->btree;
	page = cbt->ref->page;
	session = (WT_SESSION_IMPL *)cbt->iface.session;
	if (updp != NULL)
		*updp = NULL;

	/*
	 * We may be pointing to an insert object, and we may have a page with
	 * existing entries.  Insert objects always have associated update
	 * objects (the value).  Any update object may be deleted, or invisible
	 * to us.  In the case of an on-page entry, there is by definition a
	 * value that is visible to us, the original page cell.
	 *
	 * If we find a visible update structure, return our caller a reference
	 * to it because we don't want to repeatedly search for the update, it
	 * might suddenly become invisible (imagine a read-uncommitted session
	 * with another session's aborted insert), and we don't want to handle
	 * that potential error every time we look at the value.
	 *
	 * Unfortunately, the objects we might have and their relationships are
	 * different for the underlying page types.
	 *
	 * In the case of row-store, an insert object implies ignoring any page
	 * objects, no insert object can have the same key as an on-page object.
	 * For row-store:
	 *	if there's an insert object:
	 *		if there's a visible update:
	 *			exact match
	 *		else
	 *			no exact match
	 *	else
	 *		use the on-page object (which may have an associated
	 *		update object that may or may not be visible to us).
	 *
	 * Column-store is more complicated because an insert object can have
	 * the same key as an on-page object: updates to column-store rows
	 * are insert/object pairs, and an invisible update isn't the end as
	 * there may be an on-page object that is visible.  This changes the
	 * logic to:
	 *	if there's an insert object:
	 *		if there's a visible update:
	 *			exact match
	 *		else if the on-page object's key matches the insert key
	 *			use the on-page object
	 *	else
	 *		use the on-page object
	 *
	 * First, check for an insert object with a visible update (a visible
	 * update that's been deleted is not a valid key/value pair).
	 */
	if (cbt->ins != NULL &&
	    (upd = __wt_txn_read(session, cbt->ins->upd)) != NULL) {
		if (WT_UPDATE_DELETED_ISSET(upd))
			return (false);
		if (updp != NULL)
			*updp = upd;
		return (true);
	}

	/*
	 * If we don't have an insert object, or in the case of column-store,
	 * there's an insert object but no update was visible to us and the key
	 * on the page is the same as the insert object's key, and the slot as
	 * set by the search function is valid, we can use the original page
	 * information.
	 */
	switch (btree->type) {
	case BTREE_COL_FIX:
		/*
		 * If search returned an insert object, there may or may not be
		 * a matching on-page object, we have to check.  Fixed-length
		 * column-store pages don't have slots, but map one-to-one to
		 * keys, check for retrieval past the end of the page.
		 */
		if (cbt->recno >= cbt->ref->ref_recno + page->pg_fix_entries)
			return (false);

		/*
		 * An update would have appeared as an "insert" object; no
		 * further checks to do.
		 */
		break;
	case BTREE_COL_VAR:
		/* The search function doesn't check for empty pages. */
		if (page->pg_var_entries == 0)
			return (false);
		WT_ASSERT(session, cbt->slot < page->pg_var_entries);

		/*
		 * Column-store updates are stored as "insert" objects. If
		 * search returned an insert object we can't return, the
		 * returned on-page object must be checked for a match.
		 */
		if (cbt->ins != NULL && !F_ISSET(cbt, WT_CBT_VAR_ONPAGE_MATCH))
			return (false);

		/*
		 * Although updates would have appeared as "insert" objects,
		 * variable-length column store deletes are written into the
		 * backing store; check the cell for a record already deleted
		 * when read.
		 */
		cip = &page->pg_var_d[cbt->slot];
		if ((cell = WT_COL_PTR(page, cip)) == NULL ||
		    __wt_cell_type(cell) == WT_CELL_DEL)
			return (false);
		break;
	case BTREE_ROW:
		/* The search function doesn't check for empty pages. */
		if (page->pg_row_entries == 0)
			return (false);
		WT_ASSERT(session, cbt->slot < page->pg_row_entries);

		/*
		 * See above: for row-store, no insert object can have the same
		 * key as an on-page object, we're done.
		 */
		if (cbt->ins != NULL)
			return (false);

		/* Check for an update. */
		if (page->modify != NULL &&
		    page->modify->mod_row_update != NULL &&
		    (upd = __wt_txn_read(session,
		    page->modify->mod_row_update[cbt->slot])) != NULL) {
			if (WT_UPDATE_DELETED_ISSET(upd))
				return (false);
			if (updp != NULL)
				*updp = upd;
		}
		break;
	}
	return (true);
}
Example #13
/*
 * __wt_btcur_update --
 *	Update a record in the tree.
 */
int
__wt_btcur_update(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;

	WT_STAT_CONN_INCR(session, cursor_update);
	WT_STAT_DATA_INCR(session, cursor_update);
	WT_STAT_DATA_INCRV(session, cursor_update_bytes, cursor->value.size);

	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));
	WT_RET(__cursor_size_chk(session, &cursor->value));

	/*
	 * The tree is no longer empty: eviction should pay attention to it,
	 * and it's no longer possible to bulk-load into it.
	 */
	if (btree->bulk_load_ok) {
		btree->bulk_load_ok = false;
		__wt_btree_evictable(session, true);
	}

retry:	WT_RET(__cursor_func_init(cbt, true));

	switch (btree->type) {
	case BTREE_COL_FIX:
	case BTREE_COL_VAR:
		WT_ERR(__cursor_col_search(session, cbt, NULL));

		/*
		 * If not overwriting, fail if the key doesn't exist.  If we
		 * find an update for the key, check for conflicts.  Update the
		 * record if it exists.  Creating a record past the end of the
		 * tree in a fixed-length column-store implicitly fills the gap
		 * with empty records.  Update the record in that case, the
		 * record exists.
		 */
		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
			WT_ERR(__curfile_update_check(cbt));
			if ((cbt->compare != 0 || !__cursor_valid(cbt, NULL)) &&
			    !__cursor_fix_implicit(btree, cbt))
				WT_ERR(WT_NOTFOUND);
		}
		ret = __cursor_col_modify(session, cbt, false);
		break;
	case BTREE_ROW:
		WT_ERR(__cursor_row_search(session, cbt, NULL, true));
		/*
		 * If not overwriting, check for conflicts and fail if the key
		 * does not exist.
		 */
		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
			WT_ERR(__curfile_update_check(cbt));
			if (cbt->compare != 0 || !__cursor_valid(cbt, NULL))
				WT_ERR(WT_NOTFOUND);
		}
		ret = __cursor_row_modify(session, cbt, false);
		break;
	}

err:	if (ret == WT_RESTART) {
		WT_STAT_CONN_INCR(session, cursor_restart);
		WT_STAT_DATA_INCR(session, cursor_restart);
		goto retry;
	}

	/*
	 * If successful, point the cursor at internal copies of the data.  We
	 * could shuffle memory in the cursor so the key/value pair are in local
	 * buffer memory, but that's a data copy.  We don't want to do another
	 * search (and we might get a different update structure if we race).
	 * To make this work, we add a field to the btree cursor to pass back a
	 * pointer to the modify function's allocated update structure.
	 */
	if (ret == 0)
		WT_TRET(__wt_kv_return(session, cbt, cbt->modify_update));

	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}
Example #14
/*
 * __wt_btcur_remove --
 *	Remove a record from the tree.
 */
int
__wt_btcur_remove(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;

	WT_STAT_CONN_INCR(session, cursor_remove);
	WT_STAT_DATA_INCR(session, cursor_remove);
	WT_STAT_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size);

retry:	WT_RET(__cursor_func_init(cbt, true));

	switch (btree->type) {
	case BTREE_COL_FIX:
	case BTREE_COL_VAR:
		WT_ERR(__cursor_col_search(session, cbt, NULL));

		/*
		 * If we find a matching record, check whether an update would
		 * conflict.  Do this before checking if the update is visible
		 * in __cursor_valid, or we can miss a conflict.
		 */
		WT_ERR(__curfile_update_check(cbt));

		/* Remove the record if it exists. */
		if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) {
			if (!__cursor_fix_implicit(btree, cbt))
				WT_ERR(WT_NOTFOUND);
			/*
			 * Creating a record past the end of the tree in a
			 * fixed-length column-store implicitly fills the
			 * gap with empty records.  Return success in that
			 * case, the record was deleted successfully.
			 *
			 * Correct the btree cursor's location: the search
			 * will have pointed us at the previous/next item,
			 * and that's not correct.
			 */
			cbt->recno = cursor->recno;
		} else
			ret = __cursor_col_modify(session, cbt, true);
		break;
	case BTREE_ROW:
		/* Remove the record if it exists. */
		WT_ERR(__cursor_row_search(session, cbt, NULL, false));

		/* Check whether an update would conflict. */
		WT_ERR(__curfile_update_check(cbt));

		if (cbt->compare != 0 || !__cursor_valid(cbt, NULL))
			WT_ERR(WT_NOTFOUND);

		ret = __cursor_row_modify(session, cbt, true);
		break;
	}

err:	if (ret == WT_RESTART) {
		WT_STAT_CONN_INCR(session, cursor_restart);
		WT_STAT_DATA_INCR(session, cursor_restart);
		goto retry;
	}
	/*
	 * If the cursor is configured to overwrite and the record is not
	 * found, that is exactly what we want.
	 */
	if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) && ret == WT_NOTFOUND)
		ret = 0;

	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));

	return (ret);
}
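__wt_btcur_remove backs WT_CURSOR::remove. As the tail of the function shows, a cursor configured with overwrite (the default) treats removing a missing key as success, while overwrite=false returns WT_NOTFOUND. A minimal sketch that accepts either configuration, assuming a string-keyed table:

#include <wiredtiger.h>

/*
 * remove_if_present --
 *	Sketch: delete a key, ignoring the case where it doesn't exist.
 */
static int
remove_if_present(WT_CURSOR *cursor, const char *key)
{
	int ret;

	cursor->set_key(cursor, key);		/* assumes key_format=S */
	ret = cursor->remove(cursor);
	return (ret == WT_NOTFOUND ? 0 : ret);	/* treat "not found" as success */
}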
Example #15
/*
 * __wt_verify --
 *	Verify a file.
 */
int
__wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_CKPT *ckptbase, *ckpt;
	WT_DECL_RET;
	WT_VSTUFF *vs, _vstuff;
	uint32_t root_addr_size;
	uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE];

	btree = S2BT(session);
	bm = btree->bm;
	ckptbase = NULL;

	WT_CLEAR(_vstuff);
	vs = &_vstuff;
	WT_ERR(__wt_scr_alloc(session, 0, &vs->max_key));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->max_addr));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp1));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp2));

	/* Check configuration strings. */
	WT_ERR(__verify_config(session, cfg, vs));

	/* Get a list of the checkpoints for this file. */
	WT_ERR(
	    __wt_meta_ckptlist_get(session, btree->dhandle->name, &ckptbase));

	/* Inform the underlying block manager we're verifying. */
	WT_ERR(bm->verify_start(bm, session, ckptbase));

	/* Loop through the file's checkpoints, verifying each one. */
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		WT_VERBOSE_ERR(session, verify,
		    "%s: checkpoint %s", btree->dhandle->name, ckpt->name);
#ifdef HAVE_DIAGNOSTIC
		if (vs->dump_address || vs->dump_blocks || vs->dump_pages)
			WT_ERR(__wt_msg(session, "%s: checkpoint %s",
			    btree->dhandle->name, ckpt->name));
#endif

		/* Fake checkpoints require no work. */
		if (F_ISSET(ckpt, WT_CKPT_FAKE))
			continue;

		/* House-keeping between checkpoints. */
		__verify_checkpoint_reset(vs);

		/* Load the checkpoint, ignore trees with no root page. */
		WT_ERR(bm->checkpoint_load(bm, session,
		    ckpt->raw.data, ckpt->raw.size,
		    root_addr, &root_addr_size, 1));
		if (root_addr_size != 0) {
			/* Verify then discard the checkpoint from the cache. */
			if ((ret = __wt_btree_tree_open(
			    session, root_addr, root_addr_size)) == 0) {
				ret = __verify_tree(
				    session, btree->root_page, vs);
				WT_TRET(__wt_bt_cache_op(
				    session, NULL, WT_SYNC_DISCARD_NOWRITE));
			}
		}

		/* Unload the checkpoint. */
		WT_TRET(bm->checkpoint_unload(bm, session));
		WT_ERR(ret);
	}
Exemplo n.º 16
0
/*
 * v_motion --
 *
 * Get resulting motion mark.
 */
static int
v_motion(SCR *sp, VICMD *dm, VICMD *vp, int *mappedp)
{
	VICMD motion;
	gcret_t gcret;
	size_t len;
	u_long cnt;
	u_int flags;
	int tilde_reset, notused;

	/*
	 * If '.' command, use the dot motion, else get the motion command.
	 * Clear any line motion flags, the subsequent motion isn't always
	 * the same, i.e. "/aaa" may or may not be a line motion.
	 */
	if (F_ISSET(vp, VC_ISDOT)) {
		motion = *dm;
		F_SET(&motion, VC_ISDOT);
		F_CLR(&motion, VM_COMMASK);
		gcret = GC_OK;
	} else {
		memset(&motion, 0, sizeof(VICMD));
		gcret = v_cmd(sp, NULL, &motion, vp, &notused, mappedp);
		if (gcret != GC_OK && gcret != GC_EVENT)
			return (1);
	}

	/*
	 * A count may be provided both to the command and to the motion, in
	 * which case the count is multiplicative.  For example, "3y4y" is the
	 * same as "12yy".  This count is provided to the motion command and
	 * not to the regular function.
	 */
	cnt = motion.count = F_ISSET(&motion, VC_C1SET) ? motion.count : 1;
	if (F_ISSET(vp, VC_C1SET)) {
		motion.count *= vp->count;
		F_SET(&motion, VC_C1SET);

		/*
		 * Set flags to restore the original values of the command
		 * structure so dot commands can change the count values,
		 * e.g. "2dw" "3." deletes a total of five words.
		 */
		F_CLR(vp, VC_C1SET);
		F_SET(vp, VC_C1RESET);
	}

	/*
	 * Some commands can be repeated to indicate the current line.  In
	 * this case, or if the command is a "line command", set the flags
	 * appropriately.  If not a doubled command, run the function to get
	 * the resulting mark.
 	 */
	if (gcret != GC_EVENT && vp->key == motion.key) {
		F_SET(vp, VM_LDOUBLE | VM_LMODE);

		/* Set the origin of the command. */
		vp->m_start.lno = sp->lno;
		vp->m_start.cno = 0;

		/*
		 * Set the end of the command.
		 *
		 * If the current line is missing, i.e. the file is empty,
		 * historic vi permitted a "cc" or "!!" command to insert
		 * text.
		 */
		vp->m_stop.lno = sp->lno + motion.count - 1;
		if (db_get(sp, vp->m_stop.lno, 0, NULL, &len)) {
			if (vp->m_stop.lno != 1 ||
			   (vp->key != 'c' && vp->key != '!')) {
				v_emsg(sp, NULL, VIM_EMPTY);
				return (1);
			}
			vp->m_stop.cno = 0;
		} else
			vp->m_stop.cno = len ? len - 1 : 0;
	} else {
		/*
		 * Motion commands change the underlying movement (*snarl*).
		 * For example, "l" is illegal at the end of a line, but "dl"
		 * is not.  Set flags so the function knows the situation.
		 */
		motion.rkp = vp->kp;

		/*
		 * XXX
		 * Use yank instead of creating a new motion command, it's a
		 * lot easier for now.
		 */
		if (vp->kp == &tmotion) {
			tilde_reset = 1;
			vp->kp = &vikeys['y'];
		} else
			tilde_reset = 0;

		/*
		 * Copy the key flags into the local structure, except for the
		 * RCM flags -- the motion command will set the RCM flags in
		 * the vp structure if necessary.  This means that the motion
		 * command is expected to determine where the cursor ends up!
		 * However, we save off the current RCM mask and restore it if
		 * no RCM flags are set by the motion command, with a small
		 * modification.
		 *
		 * We replace the VM_RCM_SET flag with the VM_RCM flag.  This
		 * is so that cursor movement doesn't set the relative position
		 * unless the motion command explicitly specified it.  This
		 * appears to match historic practice, but I've never been able
		 * to develop a hard-and-fast rule.
		 */
		flags = F_ISSET(vp, VM_RCM_MASK);
		if (LF_ISSET(VM_RCM_SET)) {
			LF_SET(VM_RCM);
			LF_CLR(VM_RCM_SET);
		}
		F_CLR(vp, VM_RCM_MASK);
		F_SET(&motion, motion.kp->flags & ~VM_RCM_MASK);

		/*
		 * Set the three cursor locations to the current cursor.  This
		 * permits commands like 'j' and 'k', that are line oriented
		 * motions and have special cursor suck semantics when they are
		 * used as standalone commands, to ignore column positioning.
		 */
		motion.m_final.lno =
		    motion.m_stop.lno = motion.m_start.lno = sp->lno;
		motion.m_final.cno =
		    motion.m_stop.cno = motion.m_start.cno = sp->cno;

		/* Run the function. */
		if ((motion.kp->func)(sp, &motion))
			return (1);

		/*
		 * If the current line is missing, i.e. the file is empty,
		 * historic vi allowed "c<motion>" or "!<motion>" to insert
		 * text.  Otherwise fail -- most motion commands will have
		 * already failed, but some, e.g. G, succeed in empty files.
		 */
		if (!db_exist(sp, vp->m_stop.lno)) {
			if (vp->m_stop.lno != 1 ||
			   (vp->key != 'c' && vp->key != '!')) {
				v_emsg(sp, NULL, VIM_EMPTY);
				return (1);
			}
			vp->m_stop.cno = 0;
		}

		/*
		 * XXX
		 * See above.
		 */
		if (tilde_reset)
			vp->kp = &tmotion;

		/*
		 * Copy cut buffer, line mode and cursor position information
		 * from the motion command structure, i.e. anything that the
		 * motion command can set for us.  The commands can flag the
		 * movement as a line motion (see v_sentence) as well as set
		 * the VM_RCM_* flags explicitly.
		 */
		F_SET(vp, F_ISSET(&motion, VM_COMMASK | VM_RCM_MASK));

		/*
		 * If the motion command set no relative motion flags, use
		 * the (slightly) modified previous values.
		 */
		if (!F_ISSET(vp, VM_RCM_MASK))
			F_SET(vp, flags);

		/*
		 * Commands can change behaviors based on the motion command
		 * used, for example, the ! command repeated the last bang
		 * command if N or n was used as the motion.
		 */
		vp->rkp = motion.kp;

		/*
		 * Motion commands can reset all of the cursor information.
		 * If the motion is in the reverse direction, switch the
		 * from and to MARK's so that it's in a forward direction.
		 * Motions are from the from MARK to the to MARK (inclusive).
		 */
		if (motion.m_start.lno > motion.m_stop.lno ||
		    (motion.m_start.lno == motion.m_stop.lno &&
		    motion.m_start.cno > motion.m_stop.cno)) {
			vp->m_start = motion.m_stop;
			vp->m_stop = motion.m_start;
		} else {
			vp->m_start = motion.m_start;
			vp->m_stop = motion.m_stop;
		}
		vp->m_final = motion.m_final;
	}

	/*
	 * If the command sets dot, save the motion structure.  The motion
	 * count was changed above and needs to be reset, that's why this
	 * is done here, and not in the calling routine.
	 */
	if (F_ISSET(vp->kp, V_DOT)) {
		*dm = motion;
		dm->count = cnt;
	}
	return (0);
}
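A tiny standalone illustration (not nvi code) of the multiplicative count rule described in v_motion: a count given to the command and a count given to the motion multiply, so "3y4y" affects the same 12 lines as "12yy".

#include <stdio.h>

static unsigned long
effective_count(int cmd_count_set, unsigned long cmd_count,
    int motion_count_set, unsigned long motion_count)
{
	unsigned long cnt;

	/* The motion's count defaults to 1; the command's count scales it. */
	cnt = motion_count_set ? motion_count : 1;
	if (cmd_count_set)
		cnt *= cmd_count;
	return (cnt);
}

int
main(void)
{
	/* "3y4y": 3 on the command, 4 on the motion -> 12 lines. */
	printf("%lu\n", effective_count(1, 3, 1, 4));
	return (0);
}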
Exemplo n.º 17
0
/*
 * __wt_rec_evict --
 *	Reconciliation plus eviction.
 */
int
__wt_rec_evict(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	int single;

	conn = S2C(session);

	WT_VERBOSE_RET(session, evict,
	    "page %p (%s)", page, __wt_page_type_string(page->type));

	WT_ASSERT(session, session->excl_next == 0);
	single = LF_ISSET(WT_REC_SINGLE) ? 1 : 0;

	/*
	 * Get exclusive access to the page and review the page and its subtree
	 * for conditions that would block our eviction of the page.  If the
	 * check fails (for example, we find a child page that can't be merged),
	 * we're done.  We have to make this check for clean pages, too: while
	 * it's unlikely eviction would choose an internal page with children,
	 * it's not disallowed anywhere.
	 *
	 * Note that page->ref may be NULL in some cases (e.g., for root pages
	 * or during salvage).  That's OK if WT_REC_SINGLE is set: we won't
	 * check hazard references in that case.
	 */
	WT_ERR(__rec_review(session, page->ref, page, flags, 1));

	/* Count evictions of internal pages during normal operation. */
	if (!single &&
	    (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT))
		WT_STAT_INCR(conn->stats, cache_evict_internal);

	/* Update the parent and discard the page. */
	if (page->modify == NULL || !F_ISSET(page->modify, WT_PM_REC_MASK)) {
		WT_STAT_INCR(conn->stats, cache_evict_unmodified);
		WT_ASSERT(session, single || page->ref->state == WT_REF_LOCKED);

		if (WT_PAGE_IS_ROOT(page))
			__rec_root_update(session);
		else
			__rec_page_clean_update(session, page);

		/* Discard the page. */
		__rec_discard_page(session, page, single);
	} else {
		WT_STAT_INCR(conn->stats, cache_evict_modified);

		if (WT_PAGE_IS_ROOT(page))
			__rec_root_update(session);
		else
			WT_ERR(__rec_page_dirty_update(session, page));

		/* Discard the tree rooted in this page. */
		__rec_discard_tree(session, page, single);
	}
	if (0) {
err:		/*
		 * If unable to evict this page, release exclusive reference(s)
		 * we've acquired.
		 */
		__rec_excl_clear(session);
	}
	session->excl_next = 0;

	return (ret);
}
Exemplo n.º 18
0
/*
 * __wt_curds_create --
 *	Initialize a data-source cursor.
 */
int
__wt_curds_create(WT_SESSION_IMPL *session, const char *uri,
    const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR **cursorp)
{
	WT_CURSOR_STATIC_INIT(iface,
	    NULL,			/* get-key */
	    NULL,			/* get-value */
	    NULL,			/* set-key */
	    NULL,			/* set-value */
	    NULL,			/* compare */
	    __curds_next,		/* next */
	    __curds_prev,		/* prev */
	    __curds_reset,		/* reset */
	    __curds_search,		/* search */
	    __curds_search_near,	/* search-near */
	    __curds_insert,		/* insert */
	    __curds_update,		/* update */
	    __curds_remove,		/* remove */
	    __curds_close);		/* close */
	WT_CONFIG_ITEM cval;
	WT_CURSOR *cursor, *dsc;
	WT_DECL_RET;
	const char *metaconf;

	metaconf = NULL;

	/* Open the WiredTiger cursor. */
	WT_RET(__wt_calloc_def(session, 1, &cursor));
	*cursor = iface;
	cursor->session = (WT_SESSION *)session;

	/*
	 * XXX
	 * We'll need the object's key and value formats.
	 */
	WT_ERR(__wt_metadata_search(session, uri, &metaconf));
	WT_ERR(__wt_config_getones(session, metaconf, "key_format", &cval));
	WT_ERR(__wt_strndup(session, cval.str, cval.len, &cursor->key_format));
	WT_ERR(__wt_config_getones(session, metaconf, "value_format", &cval));
	WT_ERR(
	    __wt_strndup(session, cval.str, cval.len, &cursor->value_format));

	WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));

	WT_ERR(dsrc->open_cursor(dsrc,
	    &session->iface, uri, (WT_CONFIG_ARG *)cfg, &dsc));
	dsc->session = (WT_SESSION *)session;
	memset(&dsc->q, 0, sizeof(dsc->q));
	dsc->recno = 0;
	memset(dsc->raw_recno_buf, 0, sizeof(dsc->raw_recno_buf));
	memset(&dsc->key, 0, sizeof(dsc->key));
	memset(&dsc->value, 0, sizeof(dsc->value));
	memset(&dsc->saved_err, 0, sizeof(dsc->saved_err));
	dsc->data_source = NULL;
	memset(&dsc->flags, 0, sizeof(dsc->flags));

	/* Reference the underlying application cursor. */
	cursor->data_source = dsc;

	if (0) {
err:		if (F_ISSET(cursor, WT_CURSTD_OPEN))
			WT_TRET(cursor->close(cursor));
		else
			__wt_free(session, cursor);
	}

	__wt_free(session, metaconf);
	return (ret);
}
Exemplo n.º 19
0
/*
 * __wt_page_inmem --
 *	Build in-memory page information.
 */
int
__wt_page_inmem(WT_SESSION_IMPL *session,
    WT_REF *ref, const void *image, uint32_t flags, WT_PAGE **pagep)
{
	WT_DECL_RET;
	WT_PAGE *page;
	const WT_PAGE_HEADER *dsk;
	uint32_t alloc_entries;
	size_t size;

	*pagep = NULL;

	dsk = image;
	alloc_entries = 0;

	/*
	 * Figure out how many underlying objects the page references so we can
	 * allocate them along with the page.
	 */
	switch (dsk->type) {
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
		/*
		 * Column-store leaf page entries map one-to-one to the number
		 * of physical entries on the page (each physical entry is a
		 * value item).
		 *
		 * Column-store internal page entries map one-to-one to the
		 * number of physical entries on the page (each entry is a
		 * location cookie).
		 */
		alloc_entries = dsk->u.entries;
		break;
	case WT_PAGE_ROW_INT:
		/*
		 * Row-store internal page entries map one-to-two to the number
		 * of physical entries on the page (each entry is a key and
		 * location cookie pair).
		 */
		alloc_entries = dsk->u.entries / 2;
		break;
	case WT_PAGE_ROW_LEAF:
		/*
		 * If the "all values empty" flag is set, each physical entry
		 * is a key, so row-store leaf page entries map one-to-one to
		 * the number of physical entries on the page.  If the "no
		 * empty values" flag is set, the physical entries are
		 * key/value pairs, so there are half as many rows as entries.
		 * If neither flag is set, there are more keys than values and
		 * we have to walk the page to figure it out.
		 */
		if (F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL))
			alloc_entries = dsk->u.entries;
		else if (F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE))
			alloc_entries = dsk->u.entries / 2;
		else
			WT_RET(__inmem_row_leaf_entries(
			    session, dsk, &alloc_entries));
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* Allocate and initialize a new WT_PAGE. */
	WT_RET(__wt_page_alloc(
	    session, dsk->type, dsk->recno, alloc_entries, 1, &page));
	page->dsk = dsk;
	F_SET_ATOMIC(page, flags);

	/*
	 * Track the memory allocated to build this page so we can update the
	 * cache statistics in a single call.
	 */
	size = LF_ISSET(WT_PAGE_DISK_ALLOC) ? dsk->mem_size : 0;

	switch (page->type) {
	case WT_PAGE_COL_FIX:
		__inmem_col_fix(session, page);
		break;
	case WT_PAGE_COL_INT:
		__inmem_col_int(session, page);
		break;
	case WT_PAGE_COL_VAR:
		WT_ERR(__inmem_col_var(session, page, &size));
		break;
	case WT_PAGE_ROW_INT:
		WT_ERR(__inmem_row_int(session, page, &size));
		break;
	case WT_PAGE_ROW_LEAF:
		WT_ERR(__inmem_row_leaf(session, page));
		break;
	WT_ILLEGAL_VALUE_ERR(session);
	}

	/* Update the page's in-memory size and the cache statistics. */
	__wt_cache_page_inmem_incr(session, page, size);

	/* Link the new internal page to the parent. */
	if (ref != NULL) {
		switch (page->type) {
		case WT_PAGE_COL_INT:
		case WT_PAGE_ROW_INT:
			page->pg_intl_parent_ref = ref;
			break;
		}
		ref->page = page;
	}

	*pagep = page;
	return (0);

err:	__wt_page_out(session, &page);
	return (ret);
}
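A simplified restatement (not WiredTiger code; the page-type names are invented for the example) of how __wt_page_inmem sizes its index allocation from the on-disk entry count. The mixed row-store leaf case returns 0 here to stand in for the page walk the real code performs.

#include <stdint.h>

enum page_type { COL_FIX, COL_INT, COL_VAR, ROW_INT, ROW_LEAF };

static uint32_t
alloc_entries_for(enum page_type type,
    uint32_t disk_entries, int all_values_empty, int no_values_empty)
{
	switch (type) {
	case COL_FIX:
	case COL_INT:
	case COL_VAR:
		return (disk_entries);			/* one slot per entry */
	case ROW_INT:
		return (disk_entries / 2);		/* key/address pairs */
	case ROW_LEAF:
		if (all_values_empty)
			return (disk_entries);		/* keys only */
		if (no_values_empty)
			return (disk_entries / 2);	/* key/value pairs */
		return (0);				/* unknown: walk the page */
	}
	return (0);
}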
Exemplo n.º 20
0
int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
	WT_DECL_RET;
	WT_PAGE *page;
	u_int sleep_cnt, wait_cnt;
	int busy, force_attempts, oldgen;

	for (force_attempts = oldgen = 0, wait_cnt = 0;;) {
		switch (ref->state) {
		case WT_REF_DISK:
		case WT_REF_DELETED:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);

			/*
			 * The page isn't in memory, attempt to read it.
			 * Make sure there is space in the cache.
			 */
			WT_RET(__wt_cache_full_check(session));
			WT_RET(__wt_cache_read(session, ref));
			oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
			    F_ISSET(session, WT_SESSION_NO_CACHE);
			continue;
		case WT_REF_READING:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);
			WT_STAT_FAST_CONN_INCR(session, page_read_blocked);
			break;
		case WT_REF_LOCKED:
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);
			WT_STAT_FAST_CONN_INCR(session, page_locked_blocked);
			break;
		case WT_REF_SPLIT:
			return (WT_RESTART);
		case WT_REF_MEM:
			/*
			 * The page is in memory.
			 *
			 * Get a hazard pointer if one is required. We cannot
			 * be evicting if no hazard pointer is required, we're
			 * done.
			 */
			if (F_ISSET(S2BT(session), WT_BTREE_IN_MEMORY))
				goto skip_evict;

			/*
			 * The expected reason we can't get a hazard pointer is
			 * because the page is being evicted, yield, try again.
			 */
#ifdef HAVE_DIAGNOSTIC
			WT_RET(
			    __wt_hazard_set(session, ref, &busy, file, line));
#else
			WT_RET(__wt_hazard_set(session, ref, &busy));
#endif
			if (busy) {
				WT_STAT_FAST_CONN_INCR(
				    session, page_busy_blocked);
				break;
			}

			/*
			 * If eviction is configured for this file, check to see
			 * if the page qualifies for forced eviction and update
			 * the page's generation number. If eviction isn't being
			 * done on this file, we're done.
			 */
			if (F_ISSET(S2BT(session), WT_BTREE_NO_EVICTION))
				goto skip_evict;

			/*
			 * Forcibly evict pages that are too big.
			 */
			page = ref->page;
			if (force_attempts < 10 &&
			    __evict_force_check(session, page, flags)) {
				++force_attempts;
				ret = __wt_page_release_evict(session, ref);
				/* If forced eviction fails, stall. */
				if (ret == EBUSY) {
					ret = 0;
					wait_cnt += 1000;
					WT_STAT_FAST_CONN_INCR(session,
					    page_forcible_evict_blocked);
					break;
				} else
					WT_RET(ret);

				/*
				 * The result of a successful forced eviction
				 * is a page-state transition (potentially to
				 * an in-memory page we can use, or a restart
				 * return for our caller), continue the outer
				 * page-acquisition loop.
				 */
				continue;
			}

			/*
			 * If we read the page and we are configured to not
			 * trash the cache, set the oldest read generation so
			 * the page is forcibly evicted as soon as possible.
			 *
			 * Otherwise, update the page's read generation.
			 */
			if (oldgen && page->read_gen == WT_READGEN_NOTSET)
				__wt_page_evict_soon(page);
			else if (!LF_ISSET(WT_READ_NO_GEN) &&
			    page->read_gen != WT_READGEN_OLDEST &&
			    page->read_gen < __wt_cache_read_gen(session))
				page->read_gen =
				    __wt_cache_read_gen_set(session);
skip_evict:
			/*
			 * Check if we need an autocommit transaction.
			 * Starting a transaction can trigger eviction, so skip
			 * it if eviction isn't permitted.
			 */
			return (LF_ISSET(WT_READ_NO_EVICT) ? 0 :
			    __wt_txn_autocommit_check(session));
		WT_ILLEGAL_VALUE(session);
		}

		/*
		 * We failed to get the page -- yield before retrying, and if
		 * we've yielded enough times, start sleeping so we don't burn
		 * CPU to no purpose.
		 */
		if (++wait_cnt < 1000)
			__wt_yield();
		else {
			sleep_cnt = WT_MIN(wait_cnt, 10000);
			wait_cnt *= 2;
			WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt);
			__wt_sleep(0, sleep_cnt);
		}
	}
}
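A generic sketch (not WiredTiger code) of the yield-then-sleep backoff used at the bottom of __wt_page_in_func: spin politely for the first thousand attempts, then sleep with a growing, capped interval so a stalled caller stops burning CPU.

#include <sched.h>
#include <stdint.h>
#include <unistd.h>

static void
backoff(uint64_t *wait_cntp)
{
	uint64_t sleep_us;

	if (++*wait_cntp < 1000) {
		(void)sched_yield();	/* cheap: give up the CPU slice */
		return;
	}

	/* Cap the sleep at 10ms, then grow the counter for next time. */
	sleep_us = *wait_cntp < 10000 ? *wait_cntp : 10000;
	*wait_cntp *= 2;
	(void)usleep((useconds_t)sleep_us);
}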
Exemplo n.º 21
0
/*
 * __bt_search --
 *	Search a btree for a key.
 *
 * Parameters:
 *	t:	tree to search
 *	key:	key to find
 *	exactp:	pointer to exact match flag
 *
 * Returns:
 *	The EPG for matching record, if any, or the EPG for the location
 *	of the key, if it were inserted into the tree, is entered into
 *	the bt_cur field of the tree.  A pointer to the field is returned.
 */
EPG *
__bt_search(BTREE *t, const DBT *key, int *exactp)
{
	PAGE *h;
	indx_t base, idx, lim;
	pgno_t pg;
	int cmp;

	BT_CLR(t);
	for (pg = P_ROOT;;) {
		if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
			return (NULL);

		/* Do a binary search on the current page. */
		t->bt_cur.page = h;
		for (base = 0, lim = NEXTINDEX(h); lim; lim >>= 1) {
			t->bt_cur.index = idx = base + (lim >> 1);
			if ((cmp = __bt_cmp(t, key, &t->bt_cur)) == 0) {
				if (h->flags & P_BLEAF) {
					*exactp = 1;
					return (&t->bt_cur);
				}
				goto next;
			}
			if (cmp > 0) {
				base = idx + 1;
				--lim;
			}
		}

		/*
		 * If it's a leaf page, we're almost done.  If no duplicates
		 * are allowed, or we have an exact match, we're done.  Else,
		 * it's possible that there were matching keys on this page,
		 * which were later deleted, and we're on a page with no matches
		 * while there are matches on other pages.  If at the start or
		 * end of a page, check the adjacent page.
		 */
		if (h->flags & P_BLEAF) {
			if (!F_ISSET(t, B_NODUPS)) {
				if (base == 0 &&
				    h->prevpg != P_INVALID &&
				    __bt_sprev(t, h, key, exactp))
					return (&t->bt_cur);
				if (base == NEXTINDEX(h) &&
				    h->nextpg != P_INVALID &&
				    __bt_snext(t, h, key, exactp))
					return (&t->bt_cur);
			}
			*exactp = 0;
			t->bt_cur.index = base;
			return (&t->bt_cur);
		}

		/*
		 * No match found.  Base is the smallest index greater than
		 * key and may be zero or a last + 1 index.  If it's non-zero,
		 * decrement by one, and record the internal page which should
		 * be a parent page for the key.  If a split later occurs, the
		 * inserted page will be to the right of the saved page.
		 */
		idx = base ? base - 1 : base;

next:		BT_PUSH(t, h->pgno, idx);
		pg = GETBINTERNAL(h, idx)->pgno;
		mpool_put(t->bt_mp, h, 0);
	}
}
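A standalone sketch (not the db(3) code) of the base/lim binary-search idiom __bt_search uses: lim halves every iteration, and on a greater-than comparison base moves past the probe while lim drops the element just skipped, so base ends up at the insertion point when there is no exact match.

#include <stddef.h>
#include <string.h>

/* Return the index of key, or the index where it would be inserted. */
static size_t
sorted_search(const char **sorted, size_t n, const char *key, int *exactp)
{
	size_t base, idx, lim;
	int cmp;

	*exactp = 0;
	for (base = 0, lim = n; lim != 0; lim >>= 1) {
		idx = base + (lim >> 1);
		if ((cmp = strcmp(key, sorted[idx])) == 0) {
			*exactp = 1;
			return (idx);
		}
		if (cmp > 0) {
			base = idx + 1;
			--lim;
		}
	}
	return (base);
}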
Exemplo n.º 22
0
/*
 * __wt_curds_open --
 *	Initialize a data-source cursor.
 */
int
__wt_curds_open(
    WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner,
    const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR **cursorp)
{
	WT_CURSOR_STATIC_INIT(iface,
	    NULL,			/* get-key */
	    NULL,			/* get-value */
	    NULL,			/* set-key */
	    NULL,			/* set-value */
	    __curds_compare,		/* compare */
	    __curds_next,		/* next */
	    __curds_prev,		/* prev */
	    __curds_reset,		/* reset */
	    __curds_search,		/* search */
	    __curds_search_near,	/* search-near */
	    __curds_insert,		/* insert */
	    __curds_update,		/* update */
	    __curds_remove,		/* remove */
	    __curds_close);		/* close */
	WT_CONFIG_ITEM cval;
	WT_CURSOR *cursor, *source;
	WT_CURSOR_DATA_SOURCE *data_source;
	WT_DECL_RET;
	const char *metaconf;

	STATIC_ASSERT(offsetof(WT_CURSOR_DATA_SOURCE, iface) == 0);

	data_source = NULL;
	metaconf = NULL;

	WT_RET(__wt_calloc_def(session, 1, &data_source));
	cursor = &data_source->iface;
	*cursor = iface;
	cursor->session = &session->iface;
	F_SET(cursor, WT_CURSTD_DATA_SOURCE);

	/*
	 * XXX
	 * The underlying data-source may require the object's key and value
	 * formats.  This isn't a particularly elegant way of getting that
	 * information to the data-source, this feels like a layering problem
	 * to me.
	 */
	WT_ERR(__wt_metadata_search(session, uri, &metaconf));
	WT_ERR(__wt_config_getones(session, metaconf, "key_format", &cval));
	WT_ERR(__wt_strndup(session, cval.str, cval.len, &cursor->key_format));
	WT_ERR(__wt_config_getones(session, metaconf, "value_format", &cval));
	WT_ERR(
	    __wt_strndup(session, cval.str, cval.len, &cursor->value_format));

	WT_ERR(__wt_cursor_init(cursor, uri, owner, cfg, cursorp));

	/* Data-source cursors have a collator reference. */
	WT_ERR(__wt_collator_config(session, cfg, &data_source->collator));

	WT_ERR(dsrc->open_cursor(dsrc,
	    &session->iface, uri, (WT_CONFIG_ARG *)cfg, &data_source->source));
	source = data_source->source;
	source->session = (WT_SESSION *)session;
	memset(&source->q, 0, sizeof(source->q));
	source->recno = 0;
	memset(source->raw_recno_buf, 0, sizeof(source->raw_recno_buf));
	memset(&source->key, 0, sizeof(source->key));
	memset(&source->value, 0, sizeof(source->value));
	source->saved_err = 0;
	source->flags = 0;

	if (0) {
err:		if (F_ISSET(cursor, WT_CURSTD_OPEN))
			WT_TRET(cursor->close(cursor));
		else
			__wt_free(session, data_source);
		*cursorp = NULL;
	}

	__wt_free(session, metaconf);
	return (ret);
}
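A hedged sketch of the application side of this plumbing: registering a custom data source so that cursors opened on a matching URI prefix are routed through __wt_curds_open and on to the data source's own open_cursor. The "mysource:" prefix and the my_open_cursor stub are invented for the example; a real data source must build and return a working WT_CURSOR.

#include <errno.h>
#include <stddef.h>
#include <wiredtiger.h>

static int
my_open_cursor(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
    const char *uri, WT_CONFIG_ARG *config, WT_CURSOR **new_cursorp)
{
	(void)dsrc; (void)session; (void)uri; (void)config; (void)new_cursorp;
	return (ENOTSUP);	/* stub only: no cursor is actually created */
}

/* Methods left NULL are treated as unsupported by the data source. */
static WT_DATA_SOURCE my_dsrc = { .open_cursor = my_open_cursor };

static int
register_my_source(WT_CONNECTION *conn)
{
	return (conn->add_data_source(conn, "mysource:", &my_dsrc, NULL));
}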
Exemplo n.º 23
0
/*
 * cl_read --
 *	Read characters from the input.
 */
static input_t
cl_read(SCR *sp, u_int32_t flags, char *bp, size_t blen, int *nrp, struct timeval *tp)
{
	struct termios term1, term2;
	CL_PRIVATE *clp;
	GS *gp;
	fd_set rdfd;
	input_t rval;
	int maxfd, nr, term_reset;

	gp = sp->gp;
	clp = CLP(sp);
	term_reset = 0;

	/*
	 * 1: A read from a file or a pipe.  In this case, the reads
	 *    never timeout regardless.  This means that we can hang
	 *    when trying to complete a map, but we're going to hang
	 *    on the next read anyway.
	 */
	if (!F_ISSET(clp, CL_STDIN_TTY)) {
		nr = extpread(STDIN_FILENO, bp, blen, O_FBLOCKING, -1);
		switch (nr) {
		case 0:
			return (INP_EOF);
		case -1:
			goto err;
		default:
			*nrp = nr;
			return (INP_OK);
		}
		/* NOTREACHED */
	}

	/*
	 * 2: A read with an associated timeout, e.g., trying to complete
	 *    a map sequence.  If input exists, we fall into #3.
	 */
	if (tp != NULL) {
		FD_ZERO(&rdfd);
		FD_SET(STDIN_FILENO, &rdfd);
		switch (select(STDIN_FILENO + 1, &rdfd, NULL, NULL, tp)) {
		case 0:
			return (INP_TIMEOUT);
		case -1:
			goto err;
		default:
			break;
		}
	}
	
	/*
	 * The user can enter a key in the editor to quote a character.  If we
	 * get here and the next key is supposed to be quoted, do what we can.
	 * Reset the tty so that the user can enter a ^C, ^Q, ^S.  There's an
	 * obvious race here, when the key has already been entered, but there's
	 * nothing that we can do to fix that problem.
	 *
	 * The editor can ask for the next literal character even though it's
	 * generally running in line-at-a-time mode.  Do what we can.
	 */
	if (LF_ISSET(EC_QUOTED | EC_RAW) && !tcgetattr(STDIN_FILENO, &term1)) {
		term_reset = 1;
		if (LF_ISSET(EC_QUOTED)) {
			term2 = term1;
			term2.c_lflag &= ~ISIG;
			term2.c_iflag &= ~(IXON | IXOFF);
			(void)tcsetattr(STDIN_FILENO,
			    TCSASOFT | TCSADRAIN, &term2);
		} else
			(void)tcsetattr(STDIN_FILENO,
			    TCSASOFT | TCSADRAIN, &clp->vi_enter);
	}

	/*
	 * 3: Wait for input.
	 *
	 * Select on the command input and scripting window file descriptors.
	 * It's ugly that we wait on scripting file descriptors here, but it's
	 * the only way to keep from locking out scripting windows.
	 */
	if (F_ISSET(gp, G_SCRWIN)) {
loop:		FD_ZERO(&rdfd);
		FD_SET(STDIN_FILENO, &rdfd);
		maxfd = STDIN_FILENO;
		if (F_ISSET(sp, SC_SCRIPT)) {
			FD_SET(sp->script->sh_master, &rdfd);
			if (sp->script->sh_master > maxfd)
				maxfd = sp->script->sh_master;
		}
		switch (select(maxfd + 1, &rdfd, NULL, NULL, NULL)) {
		case 0:
			abort();
		case -1:
			goto err;
		default:
			break;
		}
		if (!FD_ISSET(STDIN_FILENO, &rdfd)) {
			if (sscr_input(sp))
				return (INP_ERR);
			goto loop;
		}
	}

	/*
	 * 4: Read the input.
	 *
	 * !!!
	 * What's going on here is some scary stuff.  Ex runs the terminal in
	 * canonical mode.  So, the <newline> character terminating a line of
	 * input is returned in the buffer, but a trailing <EOF> character is
	 * not similarly included.  As ex uses 0<EOF> and ^<EOF> as autoindent
	 * commands, it has to see the trailing <EOF> characters to determine
	 * the difference between the user entering "0ab" and "0<EOF>ab".  We
	 * leave an extra slot in the buffer, so that we can add a trailing
	 * <EOF> character if the buffer isn't terminated by a <newline>.  We
	 * lose if the buffer is too small for the line and exactly N characters
	 * are entered followed by an <EOF> character.
	 */
#define	ONE_FOR_EOF	1
	nr = extpread(STDIN_FILENO, bp, blen - ONE_FOR_EOF, O_FBLOCKING, -1);
	switch (nr) {
	case  0:				/* EOF. */
		/*
		 * ^D in canonical mode returns a read of 0, i.e. EOF.  EOF is
		 * a valid command, but we don't want to loop forever because
		 * the terminal driver is returning EOF because the user has
		 * disconnected. The editor will almost certainly try to write
		 * something before this fires, which should kill us, but You
		 * Never Know.
		 */
		if (++clp->eof_count < 50) {
			bp[0] = clp->orig.c_cc[VEOF];
			*nrp = 1;
			rval = INP_OK;

		} else
			rval = INP_EOF;
		break;
	case -1:				/* Error or interrupt. */
err:		if (errno == EINTR)
			rval = INP_INTR;
		else {
			rval = INP_ERR;
			msgq(sp, M_SYSERR, "input");
		}
		break;
	default:				/* Input characters. */
		if (F_ISSET(sp, SC_EX) && bp[nr - 1] != '\n')
			bp[nr++] = clp->orig.c_cc[VEOF];
		*nrp = nr;
		clp->eof_count = 0;
		rval = INP_OK;
		break;
	}

	/* Restore the terminal state if it was modified. */
	if (term_reset)
		(void)tcsetattr(STDIN_FILENO, TCSASOFT | TCSADRAIN, &term1);
	return (rval);
}
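A minimal standalone sketch (not nvi code) of the select()-with-timeout pattern cl_read uses for map-completion reads: wait a bounded time for input on stdin and report timeout, error, or readiness.

#include <sys/select.h>
#include <unistd.h>

enum waitrc { WAIT_READY, WAIT_TIMEOUT, WAIT_ERROR };

static enum waitrc
wait_for_stdin(long timeout_ms)
{
	struct timeval tv;
	fd_set rdfd;

	FD_ZERO(&rdfd);
	FD_SET(STDIN_FILENO, &rdfd);
	tv.tv_sec = timeout_ms / 1000;
	tv.tv_usec = (timeout_ms % 1000) * 1000;

	switch (select(STDIN_FILENO + 1, &rdfd, NULL, NULL, &tv)) {
	case 0:
		return (WAIT_TIMEOUT);	/* nothing arrived in time */
	case -1:
		return (WAIT_ERROR);	/* interrupted or failed */
	default:
		return (WAIT_READY);	/* stdin is readable */
	}
}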
Exemplo n.º 24
0
/*
 * __clsm_enter --
 *	Start an operation on an LSM cursor, update if the tree has changed.
 */
static inline int
__clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update)
{
	WT_DECL_RET;
	WT_LSM_TREE *lsm_tree;
	WT_SESSION_IMPL *session;
	WT_TXN *txn;
	uint64_t *switch_txnp;
	uint64_t snap_min;

	lsm_tree = clsm->lsm_tree;
	session = (WT_SESSION_IMPL *)clsm->iface.session;
	txn = &session->txn;

	/* Merge cursors never update. */
	if (F_ISSET(clsm, WT_CLSM_MERGE))
		return (0);

	if (reset) {
		WT_ASSERT(session, !F_ISSET(&clsm->iface,
		   WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT));
		WT_RET(__clsm_reset_cursors(clsm, NULL));
	}

	for (;;) {
		/*
		 * If the cursor looks up-to-date, check if the cache is full.
		 * In case this call blocks, the check will be repeated before
		 * proceeding.
		 */
		if (clsm->dsk_gen != lsm_tree->dsk_gen &&
		    lsm_tree->nchunks != 0)
			goto open;
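		/*
		 * (Upstream versions of this function appear to make the
		 * blocking cache-full check mentioned above right here,
		 * which is why the generation test is repeated below.)
		 */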

		if (clsm->dsk_gen != lsm_tree->dsk_gen &&
		    lsm_tree->nchunks != 0)
			goto open;

		/* Update the maximum transaction ID in the primary chunk. */
		if (update) {
			/*
			 * Ensure that there is a transaction snapshot active.
			 */
			WT_RET(__wt_txn_autocommit_check(session));
			WT_RET(__wt_txn_id_check(session));

			WT_RET(__clsm_enter_update(clsm));
			if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
				goto open;

			if (txn->isolation == WT_ISO_SNAPSHOT)
				__wt_txn_cursor_op(session);

			/*
			 * Figure out how many updates are required for
			 * snapshot isolation.
			 *
			 * This is not a normal visibility check on the maximum
			 * transaction ID in each chunk: any transaction ID
			 * that overlaps with our snapshot is a potential
			 * conflict.
			 */
			clsm->nupdates = 1;
			if (txn->isolation == WT_ISO_SNAPSHOT &&
			    F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
				WT_ASSERT(session,
				    F_ISSET(txn, WT_TXN_HAS_SNAPSHOT));
				snap_min = txn->snap_min;
				for (switch_txnp =
				    &clsm->switch_txn[clsm->nchunks - 2];
				    clsm->nupdates < clsm->nchunks;
				    clsm->nupdates++, switch_txnp--) {
					if (WT_TXNID_LT(*switch_txnp, snap_min))
						break;
					WT_ASSERT(session,
					    !__wt_txn_visible_all(
					    session, *switch_txnp));
				}
			}
		}

		/*
		 * Stop when we are up-to-date, as long as this is:
		 *   - a snapshot isolation update and the cursor is set up for
		 *     that;
		 *   - an update operation with a primary chunk, or
		 *   - a read operation and the cursor is open for reading.
		 */
		if ((!update ||
		    txn->isolation != WT_ISO_SNAPSHOT ||
		    F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) &&
		    ((update && clsm->primary_chunk != NULL) ||
		    (!update && F_ISSET(clsm, WT_CLSM_OPEN_READ))))
			break;

open:		WT_WITH_SCHEMA_LOCK(session,
		    ret = __clsm_open_cursors(clsm, update, 0, 0));
		WT_RET(ret);
	}

	if (!F_ISSET(clsm, WT_CLSM_ACTIVE)) {
		WT_RET(__cursor_enter(session));
		F_SET(clsm, WT_CLSM_ACTIVE);
	}

	return (0);
}
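Applications never call __clsm_enter directly; it runs underneath every cursor operation on an LSM tree. A minimal sketch of reaching it through the public API ("lsm:example" is a hypothetical object name):

#include <wiredtiger.h>

static int
lsm_insert_example(WT_SESSION *session)
{
	WT_CURSOR *cursor;
	int ret;

	if ((ret = session->create(session,
	    "lsm:example", "key_format=S,value_format=S")) != 0)
		return (ret);
	if ((ret = session->open_cursor(
	    session, "lsm:example", NULL, NULL, &cursor)) != 0)
		return (ret);

	/* Each cursor operation enters and leaves the LSM cursor internally. */
	cursor->set_key(cursor, "key");
	cursor->set_value(cursor, "value");
	ret = cursor->insert(cursor);
	(void)cursor->close(cursor);
	return (ret);
}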
Exemplo n.º 25
0
/*
 * __ckpt_process --
 *	Process the list of checkpoints.
 */
static int
__ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
{
	WT_BLOCK_CKPT *a, *b, *ci;
	WT_CKPT *ckpt, *next_ckpt;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	uint64_t ckpt_size;
	bool deleting, fatal, locked;

	ci = &block->live;
	fatal = locked = false;

#ifdef HAVE_DIAGNOSTIC
	WT_RET(__ckpt_verify(session, ckptbase));
#endif

	/*
	 * Checkpoints are a two-step process: first, write a new checkpoint to
	 * disk (including all the new extent lists for modified checkpoints
	 * and the live system).  As part of this, create a list of file blocks
	 * newly available for reallocation, based on checkpoints being deleted.
	 * We then return the locations of the new checkpoint information to our
	 * caller.  Our caller has to write that information into some kind of
	 * stable storage, and once that's done, we can actually allocate from
	 * that list of newly available file blocks.  (We can't allocate from
	 * that list immediately because the allocation might happen before our
	 * caller saves the new checkpoint information, and if we crashed before
	 * the new checkpoint location was saved, we'd have overwritten blocks
	 * still referenced by checkpoints in the system.)  In summary, there is
	 * a second step: after our caller saves the checkpoint information, we
	 * are called to add the newly available blocks into the live system's
	 * available list.
	 *
	 * This function is the first step, the second step is in the resolve
	 * function.
	 *
	 * If we're called to checkpoint the same file twice (without the second
	 * resolution step), or re-entered for any reason, it's an error in our
	 * caller, and our choices are all bad: leak blocks or potentially crash
	 * with our caller not yet having saved previous checkpoint information
	 * to stable storage.
	 */
	__wt_spin_lock(session, &block->live_lock);
	switch (block->ckpt_state) {
	case WT_CKPT_INPROGRESS:
		block->ckpt_state = WT_CKPT_PANIC_ON_FAILURE;
		break;
	case WT_CKPT_NONE:
	case WT_CKPT_PANIC_ON_FAILURE:
		__wt_err(session, EINVAL,
		    "%s: an unexpected checkpoint attempt: the checkpoint "
		    "was never started or has already completed",
		    block->name);
		ret = __wt_block_panic(session);
		break;
	case WT_CKPT_SALVAGE:
		/* Salvage doesn't use the standard checkpoint APIs. */
		break;
	}
	__wt_spin_unlock(session, &block->live_lock);
	WT_RET(ret);

	/*
	 * Extents newly available as a result of deleting previous checkpoints
	 * are added to a list of extents.  The list should be empty, but as
	 * described above, there is no "free the checkpoint information" call
	 * into the block manager; if there was an error in an upper level that
	 * resulted in some previous checkpoint never being resolved, the list
	 * may not be empty.  We should have caught that with the "checkpoint
	 * in progress" test, but it doesn't cost us anything to be cautious.
	 *
	 * We free the checkpoint's allocation and discard extent lists as part
	 * of the resolution step, not because they're needed at that time, but
	 * because it's potentially a lot of work, and waiting allows the btree
	 * layer to continue eviction sooner.  As for the checkpoint-available
	 * list, make sure they get cleaned out.
	 */
	__wt_block_extlist_free(session, &ci->ckpt_avail);
	WT_RET(__wt_block_extlist_init(
	    session, &ci->ckpt_avail, "live", "ckpt_avail", true));
	__wt_block_extlist_free(session, &ci->ckpt_alloc);
	__wt_block_extlist_free(session, &ci->ckpt_discard);

	/*
	 * To delete a checkpoint, we'll need checkpoint information for it and
	 * the subsequent checkpoint into which it gets rolled; read them from
	 * disk before we lock things down.
	 */
	deleting = false;
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
		    !F_ISSET(ckpt, WT_CKPT_DELETE))
			continue;
		deleting = true;

		/*
		 * Read the checkpoint and next checkpoint extent lists if we
		 * haven't already read them (we may have already read these
		 * extent blocks if there is more than one deleted checkpoint).
		 */
		if (ckpt->bpriv == NULL)
			WT_ERR(__ckpt_extlist_read(session, block, ckpt));

		for (next_ckpt = ckpt + 1;; ++next_ckpt)
			if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
				break;

		/*
		 * The "next" checkpoint may be the live tree which has no
		 * extent blocks to read.
		 */
		if (next_ckpt->bpriv == NULL &&
		    !F_ISSET(next_ckpt, WT_CKPT_ADD))
			WT_ERR(__ckpt_extlist_read(session, block, next_ckpt));
	}

	/*
	 * Failures are now fatal: we can't currently back out the merge of any
	 * deleted checkpoint extent lists into the live system's extent lists,
	 * so continuing after error would leave the live system's extent lists
	 * corrupted for any subsequent checkpoint (and potentially, should a
	 * subsequent checkpoint succeed, for recovery).
	 */
	fatal = true;

	/*
	 * Hold a lock so the live extent lists and the file size can't change
	 * underneath us.  I suspect we'll tighten this if checkpoints take too
	 * much time away from real work: we read the historic checkpoint
	 * information without a lock, but we could also merge and re-write the
	 * deleted and merged checkpoint information without a lock, except for
	 * the final merge of ranges into the live tree.
	 */
	__wt_spin_lock(session, &block->live_lock);
	locked = true;

	/*
	 * We've allocated our last page, update the checkpoint size.  We need
	 * to calculate the live system's checkpoint size before merging
	 * checkpoint allocation and discard information from the checkpoints
	 * we're deleting, those operations change the underlying byte counts.
	 */
	ckpt_size = ci->ckpt_size;
	ckpt_size += ci->alloc.bytes;
	ckpt_size -= ci->discard.bytes;

	/* Skip the additional processing if we aren't deleting checkpoints. */
	if (!deleting)
		goto live_update;

	/*
	 * Delete any no-longer-needed checkpoints: we do this first as it frees
	 * blocks to the live lists, and the freed blocks will then be included
	 * when writing the live extent lists.
	 */
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
		    !F_ISSET(ckpt, WT_CKPT_DELETE))
			continue;

		if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
			if (tmp == NULL)
				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
			WT_ERR(__ckpt_string(
			    session, block, ckpt->raw.data, tmp));
			__wt_verbose(session, WT_VERB_CHECKPOINT,
			    "%s: delete-checkpoint: %s: %s",
			    block->name, ckpt->name, (const char *)tmp->data);
		}
		/*
		 * Find the checkpoint into which we'll roll this checkpoint's
		 * blocks: it's the next real checkpoint in the list, and it
		 * better have been read in (if it's not the add slot).
		 */
		for (next_ckpt = ckpt + 1;; ++next_ckpt)
			if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
				break;

		/*
		 * Set the from/to checkpoint structures, where the "to" value
		 * may be the live tree.
		 */
		a = ckpt->bpriv;
		if (F_ISSET(next_ckpt, WT_CKPT_ADD))
			b = &block->live;
		else
			b = next_ckpt->bpriv;

		/*
		 * Free the root page: there's nothing special about this free,
		 * the root page is allocated using normal rules, that is, it
		 * may have been taken from the avail list, and was entered on
		 * the live system's alloc list at that time.  We free it into
		 * the checkpoint's discard list, however, not the live system's
		 * list because it appears on the checkpoint's alloc list and so
		 * must be paired in the checkpoint.
		 */
		if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
			WT_ERR(__wt_block_insert_ext(session, block,
			    &a->discard, a->root_offset, a->root_size));

		/*
		 * Free the blocks used to hold the "from" checkpoint's extent
		 * lists, including the avail list.
		 */
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard));

		/*
		 * Roll the "from" alloc and discard extent lists into the "to"
		 * checkpoint's lists.
		 */
		if (a->alloc.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, block, &a->alloc, &b->alloc));
		if (a->discard.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, block, &a->discard, &b->discard));

		/*
		 * If the "to" checkpoint is also being deleted, we're done with
		 * it, it's merged into some other checkpoint in the next loop.
		 * This means the extent lists may aggregate over a number of
		 * checkpoints, but that's OK, they're disjoint sets of ranges.
		 */
		if (F_ISSET(next_ckpt, WT_CKPT_DELETE))
			continue;

		/*
		 * Find blocks for re-use: wherever the "to" checkpoint's
		 * allocate and discard lists overlap, move the range to
		 * the live system's checkpoint available list.
		 */
		WT_ERR(__wt_block_extlist_overlap(session, block, b));

		/*
		 * If we're updating the live system's information, we're done.
		 */
		if (F_ISSET(next_ckpt, WT_CKPT_ADD))
			continue;

		/*
		 * We have to write the "to" checkpoint's extent lists out in
		 * new blocks, and update its cookie.
		 *
		 * Free the blocks used to hold the "to" checkpoint's extent
		 * lists; don't include the avail list, it's not changing.
		 */
		WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard));

		F_SET(next_ckpt, WT_CKPT_UPDATE);
	}

	/* Update checkpoints marked for update. */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (F_ISSET(ckpt, WT_CKPT_UPDATE))
			WT_ERR(__ckpt_update(
			    session, block, ckpt, ckpt->bpriv, false));

live_update:
	/* Truncate the file if that's possible. */
	WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail));

	/* Update the final, added checkpoint based on the live system. */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (F_ISSET(ckpt, WT_CKPT_ADD)) {
			/*
			 * !!!
			 * Our caller wants the final checkpoint size.  Setting
			 * the size here violates layering, but the alternative
			 * is a call for the btree layer to crack the checkpoint
			 * cookie into its components, and that's a fair amount
			 * of work.
			 */
			ckpt->ckpt_size = ckpt_size;

			/*
			 * Set the rolling checkpoint size for the live system.
			 * The current size includes the current checkpoint's
			 * root page size (root pages are on the checkpoint's
			 * block allocation list as root pages are allocated
			 * with the usual block allocation functions). That's
			 * correct, but we don't want to include it in the size
			 * for the next checkpoint.
			 */
			ckpt_size -= ci->root_size;

			/*
			 * Additionally, we had a bug for a while where the live
			 * checkpoint size grew without bound. We can't sanity
			 * check the value, that would require walking the tree
			 * as part of the checkpoint. Bound any bug at the size
			 * of the file.
			 * It isn't practical to assert that the value is within
			 * bounds since databases created with older versions
			 * of WiredTiger (2.8.0) would likely see an error.
			 */
			ci->ckpt_size =
			    WT_MIN(ckpt_size, (uint64_t)block->size);

			WT_ERR(__ckpt_update(session, block, ckpt, ci, true));
		}

	/*
	 * Reset the live system's alloc and discard extent lists, leave the
	 * avail list alone.  This includes freeing a lot of extents, so do it
	 * outside of the system's lock by copying and resetting the original,
	 * then doing the work later.
	 */
	ci->ckpt_alloc = ci->alloc;
	WT_ERR(__wt_block_extlist_init(
	    session, &ci->alloc, "live", "alloc", false));
	ci->ckpt_discard = ci->discard;
	WT_ERR(__wt_block_extlist_init(
	    session, &ci->discard, "live", "discard", false));

#ifdef HAVE_DIAGNOSTIC
	/*
	 * The first checkpoint in the system should always have an empty
	 * discard list.  If we've read that checkpoint and/or created it,
	 * check.
	 */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (!F_ISSET(ckpt, WT_CKPT_DELETE))
			break;
	if ((a = ckpt->bpriv) == NULL)
		a = &block->live;
	if (a->discard.entries != 0)
		WT_ERR_MSG(session, WT_ERROR,
		    "first checkpoint incorrectly has blocks on the discard "
		    "list");
#endif

err:	if (ret != 0 && fatal) {
		__wt_err(session, ret,
		    "%s: fatal checkpoint failure", block->name);
		ret = __wt_block_panic(session);
	}

	if (locked)
		__wt_spin_unlock(session, &block->live_lock);

	/* Discard any checkpoint information we loaded. */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if ((ci = ckpt->bpriv) != NULL)
			__wt_block_ckpt_destroy(session, ci);

	__wt_scr_free(session, &tmp);
	return (ret);
}
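The two-step process described above is internal to the block manager; from the application's point of view a checkpoint is a single call. A hedged sketch (the checkpoint name is arbitrary):

#include <wiredtiger.h>

static int
checkpoint_example(WT_SESSION *session)
{
	int ret;

	/* An unnamed checkpoint of every open data source. */
	if ((ret = session->checkpoint(session, NULL)) != 0)
		return (ret);

	/* A named checkpoint, which can later be opened by cursors or dropped. */
	return (session->checkpoint(session, "name=before_upgrade"));
}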
Exemplo n.º 26
0
/*
 * __clsm_open_cursors --
 *	Open cursors for the current set of files.
 */
static int
__clsm_open_cursors(
    WT_CURSOR_LSM *clsm, bool update, u_int start_chunk, uint32_t start_id)
{
	WT_BTREE *btree;
	WT_CURSOR *c, **cp, *primary;
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	WT_SESSION_IMPL *session;
	WT_TXN *txn;
	const char *checkpoint, *ckpt_cfg[3];
	uint64_t saved_gen;
	u_int i, nchunks, ngood, nupdates;
	u_int close_range_end, close_range_start;
	bool locked;

	c = &clsm->iface;
	session = (WT_SESSION_IMPL *)c->session;
	txn = &session->txn;
	chunk = NULL;
	locked = false;
	lsm_tree = clsm->lsm_tree;

	/*
	 * Ensure that any snapshot update has cursors on the right set of
	 * chunks to guarantee visibility is correct.
	 */
	if (update && txn->isolation == WT_ISO_SNAPSHOT)
		F_SET(clsm, WT_CLSM_OPEN_SNAPSHOT);

	/*
	 * Query operations need a full set of cursors. Overwrite cursors
	 * do queries in service of updates.
	 */
	if (!update || !F_ISSET(c, WT_CURSTD_OVERWRITE))
		F_SET(clsm, WT_CLSM_OPEN_READ);

	if (lsm_tree->nchunks == 0)
		return (0);

	ckpt_cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor);
	ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw";
	ckpt_cfg[2] = NULL;

	/*
	 * If the key is pointing to memory that is pinned by a chunk
	 * cursor, take a copy before closing cursors.
	 */
	if (F_ISSET(c, WT_CURSTD_KEY_INT))
		WT_CURSOR_NEEDKEY(c);

	F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);

	WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
	locked = true;

	/* Merge cursors have already figured out how many chunks they need. */
retry:	if (F_ISSET(clsm, WT_CLSM_MERGE)) {
		nchunks = clsm->nchunks;
		ngood = 0;

		/*
		 * We may have raced with another merge completing.  Check that
		 * we're starting at the right offset in the chunk array.
		 */
		if (start_chunk >= lsm_tree->nchunks ||
		    lsm_tree->chunk[start_chunk]->id != start_id) {
			for (start_chunk = 0;
			    start_chunk < lsm_tree->nchunks;
			    start_chunk++) {
				chunk = lsm_tree->chunk[start_chunk];
				if (chunk->id == start_id)
					break;
			}
			/* We have to find the start chunk: merge locked it. */
			WT_ASSERT(session, start_chunk < lsm_tree->nchunks);
		}

		WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
	} else {
		nchunks = lsm_tree->nchunks;

		/*
		 * If we are only opening the cursor for updates, only open the
		 * primary chunk, plus any other chunks that might be required
		 * to detect snapshot isolation conflicts.
		 */
		if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
			WT_ERR(__wt_realloc_def(session,
			    &clsm->txnid_alloc, nchunks,
			    &clsm->switch_txn));
		if (F_ISSET(clsm, WT_CLSM_OPEN_READ))
			ngood = nupdates = 0;
		else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
			/*
			 * Keep going until all updates in the next
			 * chunk are globally visible.  Copy the maximum
			 * transaction IDs into the cursor as we go.
			 */
			for (ngood = nchunks - 1, nupdates = 1;
			    ngood > 0;
			    ngood--, nupdates++) {
				chunk = lsm_tree->chunk[ngood - 1];
				clsm->switch_txn[ngood - 1] = chunk->switch_txn;
				if (__wt_txn_visible_all(
				    session, chunk->switch_txn))
					break;
			}
		} else {
			nupdates = 1;
			ngood = nchunks - 1;
		}

		/* Check how many cursors are already open. */
		for (cp = clsm->cursors + ngood;
		    ngood < clsm->nchunks && ngood < nchunks;
		    cp++, ngood++) {
			chunk = lsm_tree->chunk[ngood];

			/* If the cursor isn't open yet, we're done. */
			if (*cp == NULL)
				break;

			/* Easy case: the URIs don't match. */
			if (strcmp((*cp)->uri, chunk->uri) != 0)
				break;

			/* Make sure the checkpoint config matches. */
			checkpoint = ((WT_CURSOR_BTREE *)*cp)->
			    btree->dhandle->checkpoint;
			if (checkpoint == NULL &&
			    F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
			    !chunk->empty)
				break;

			/* Make sure the Bloom config matches. */
			if (clsm->blooms[ngood] == NULL &&
			    F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
				break;
		}

		/* Spurious generation bump? */
		if (ngood == clsm->nchunks && clsm->nchunks == nchunks) {
			clsm->dsk_gen = lsm_tree->dsk_gen;
			goto err;
		}

		/*
		 * Close any cursors we no longer need.
		 *
		 * Drop the LSM tree lock while we do this: if the cache is
		 * full, we may block while closing a cursor.  Save the
		 * generation number and retry if it has changed under us.
		 */
		if (clsm->cursors != NULL && ngood < clsm->nchunks) {
			close_range_start = ngood;
			close_range_end = clsm->nchunks;
		} else if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0) {
			close_range_start = 0;
			close_range_end = WT_MIN(nchunks, clsm->nchunks);
			if (close_range_end > nupdates)
				close_range_end -= nupdates;
			else
				close_range_end = 0;
			WT_ASSERT(session, ngood >= close_range_end);
		} else {
			close_range_end = 0;
			close_range_start = 0;
		}
		if (close_range_end > close_range_start) {
			saved_gen = lsm_tree->dsk_gen;
			locked = false;
			WT_ERR(__wt_lsm_tree_readunlock(session, lsm_tree));
			WT_ERR(__clsm_close_cursors(
			    clsm, close_range_start, close_range_end));
			WT_ERR(__wt_lsm_tree_readlock(session, lsm_tree));
			locked = true;
			if (lsm_tree->dsk_gen != saved_gen)
				goto retry;
		}

		/* Detach from our old primary. */
		clsm->primary_chunk = NULL;
		clsm->current = NULL;
	}

	WT_ERR(__wt_realloc_def(session,
	    &clsm->bloom_alloc, nchunks, &clsm->blooms));
	WT_ERR(__wt_realloc_def(session,
	    &clsm->cursor_alloc, nchunks, &clsm->cursors));

	clsm->nchunks = nchunks;

	/* Open the cursors for chunks that have changed. */
	for (i = ngood, cp = clsm->cursors + i; i != nchunks; i++, cp++) {
		chunk = lsm_tree->chunk[i + start_chunk];
		/* Copy the maximum transaction ID. */
		if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
			clsm->switch_txn[i] = chunk->switch_txn;

		/*
		 * Read from the checkpoint if the file has been written.
		 * Once all cursors switch, the in-memory tree can be evicted.
		 */
		WT_ASSERT(session, *cp == NULL);
		ret = __wt_open_cursor(session, chunk->uri, c,
		    (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ?
			ckpt_cfg : NULL, cp);

		/*
		 * XXX kludge: we may have an empty chunk where no checkpoint
		 * was written.  If so, try to open the ordinary handle on that
		 * chunk instead.
		 */
		if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
			ret = __wt_open_cursor(
			    session, chunk->uri, c, NULL, cp);
			if (ret == 0)
				chunk->empty = 1;
		}
		WT_ERR(ret);

		/*
		 * Setup all cursors other than the primary to only do conflict
		 * checks on insert operations. This allows us to execute
		 * inserts on non-primary chunks as a way of checking for
		 * write conflicts with concurrent updates.
		 */
		if (i != nchunks - 1)
			(*cp)->insert = __wt_curfile_update_check;

		if (!F_ISSET(clsm, WT_CLSM_MERGE) &&
		    F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
			WT_ERR(__wt_bloom_open(session, chunk->bloom_uri,
			    lsm_tree->bloom_bit_count,
			    lsm_tree->bloom_hash_count,
			    c, &clsm->blooms[i]));

		/* Child cursors always use overwrite and raw mode. */
		F_SET(*cp, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW);
	}

	/* The last chunk is our new primary. */
	if (chunk != NULL &&
	    !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
	    chunk->switch_txn == WT_TXN_NONE) {
		clsm->primary_chunk = chunk;
		primary = clsm->cursors[clsm->nchunks - 1];
		/*
		 * Disable eviction for the in-memory chunk.  Also clear the
		 * bulk load flag here, otherwise eviction will be enabled by
		 * the first update.
		 */
		btree = ((WT_CURSOR_BTREE *)(primary))->btree;
		if (btree->bulk_load_ok) {
			btree->bulk_load_ok = false;
			WT_WITH_BTREE(session, btree,
			    __wt_btree_evictable(session, false));
		}
	}

	clsm->dsk_gen = lsm_tree->dsk_gen;

err:	
#ifdef HAVE_DIAGNOSTIC
	/* Check that all cursors are open as expected. */
	if (ret == 0 && F_ISSET(clsm, WT_CLSM_OPEN_READ)) {
		for (i = 0, cp = clsm->cursors; i != clsm->nchunks; cp++, i++) {
			chunk = lsm_tree->chunk[i + start_chunk];

			/* Make sure the cursor is open. */
			WT_ASSERT(session, *cp != NULL);

			/* Easy case: the URIs should match. */
			WT_ASSERT(session, strcmp((*cp)->uri, chunk->uri) == 0);

			/* Make sure the checkpoint config matches. */
			checkpoint = ((WT_CURSOR_BTREE *)*cp)->
			    btree->dhandle->checkpoint;
			WT_ASSERT(session,
			    (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
			    !chunk->empty) ?
			    checkpoint != NULL : checkpoint == NULL);

			/* Make sure the Bloom config matches. */
			WT_ASSERT(session,
			    (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) &&
			    !F_ISSET(clsm, WT_CLSM_MERGE)) ?
			    clsm->blooms[i] != NULL : clsm->blooms[i] == NULL);
		}
	}
#endif
	if (locked)
		WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree));
	return (ret);
}
Exemplo n.º 27
0
/*
 * __wt_txn_commit --
 *	Commit the current transaction.
 */
int
__wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_OP *op;
	u_int i;
	bool locked, readonly;
#ifdef HAVE_TIMESTAMPS
	wt_timestamp_t prev_commit_timestamp, ts;
	bool update_timestamp;
#endif

	txn = &session->txn;
	conn = S2C(session);
	txn_global = &conn->txn_global;
	locked = false;

	WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
	WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) ||
	    txn->mod_count == 0);

	readonly = txn->mod_count == 0;
	/*
	 * Look for a commit timestamp.
	 */
	WT_ERR(
	    __wt_config_gets_def(session, cfg, "commit_timestamp", 0, &cval));
	if (cval.len != 0) {
#ifdef HAVE_TIMESTAMPS
		WT_ERR(__wt_txn_parse_timestamp(session, "commit", &ts, &cval));
		WT_ERR(__wt_timestamp_validate(session,
		    "commit", &ts, &cval, true, true, true));
		__wt_timestamp_set(&txn->commit_timestamp, &ts);
		__wt_txn_set_commit_timestamp(session);
#else
		WT_ERR_MSG(session, EINVAL, "commit_timestamp requires a "
		    "version of WiredTiger built with timestamp support");
#endif
	}

#ifdef HAVE_TIMESTAMPS
	/*
	 * Debugging checks on timestamps, if user requested them.
	 */
	if (F_ISSET(txn, WT_TXN_TS_COMMIT_ALWAYS) &&
	    !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
	    txn->mod_count != 0)
		WT_ERR_MSG(session, EINVAL, "commit_timestamp required and "
		    "none set on this transaction");
	if (F_ISSET(txn, WT_TXN_TS_COMMIT_NEVER) &&
	    F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
	    txn->mod_count != 0)
		WT_ERR_MSG(session, EINVAL, "no commit_timestamp required and "
		    "timestamp set on this transaction");
#endif
	/*
	 * The default sync setting is inherited from the connection, but can
	 * be overridden by an explicit "sync" setting for this transaction.
	 */
	WT_ERR(__wt_config_gets_def(session, cfg, "sync", 0, &cval));

	/*
	 * If the user chose the default setting, check whether sync is enabled
	 * for this transaction (either inherited or via begin_transaction).
	 * If sync is disabled, clear the field to avoid the log write being
	 * flushed.
	 *
	 * Otherwise check for specific settings.  We don't need to check for
	 * "on" because that is the default inherited from the connection.  If
	 * the user set anything in begin_transaction, we only override with an
	 * explicit setting.
	 */
	if (cval.len == 0) {
		if (!FLD_ISSET(txn->txn_logsync, WT_LOG_SYNC_ENABLED) &&
		    !F_ISSET(txn, WT_TXN_SYNC_SET))
			txn->txn_logsync = 0;
	} else {
		/*
		 * If the caller already set sync on begin_transaction then
		 * they should not be using sync on commit_transaction.
		 * Flag that as an error.
		 */
		if (F_ISSET(txn, WT_TXN_SYNC_SET))
			WT_ERR_MSG(session, EINVAL,
			    "Sync already set during begin_transaction");
		if (WT_STRING_MATCH("background", cval.str, cval.len))
			txn->txn_logsync = WT_LOG_BACKGROUND;
		else if (WT_STRING_MATCH("off", cval.str, cval.len))
			txn->txn_logsync = 0;
		/*
		 * We don't need to check for "on" here because that is the
		 * default to inherit from the connection setting.
		 */
	}

	/* Commit notification. */
	if (txn->notify != NULL)
		WT_ERR(txn->notify->notify(txn->notify,
		    (WT_SESSION *)session, txn->id, 1));

	/*
	 * We are about to release the snapshot: copy values into any
	 * positioned cursors so they don't point to updates that could be
	 * freed once we don't have a snapshot.
	 */
	if (session->ncursors > 0) {
		WT_DIAGNOSTIC_YIELD;
		WT_ERR(__wt_session_copy_values(session));
	}

	/* If we are logging, write a commit log record. */
	if (txn->logrec != NULL &&
	    FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
	    !F_ISSET(session, WT_SESSION_NO_LOGGING)) {
		/*
		 * We are about to block on I/O writing the log.
		 * Release our snapshot in case it is keeping data pinned.
		 * This is particularly important for checkpoints.
		 */
		__wt_txn_release_snapshot(session);
		/*
		 * We hold the visibility lock for reading from the time
		 * we write our log record until the time we release our
		 * transaction so that the LSN any checkpoint gets will
		 * always reflect visible data.
		 */
		__wt_readlock(session, &txn_global->visibility_rwlock);
		locked = true;
		WT_ERR(__wt_txn_log_commit(session, cfg));
	}

	/* Note: we're going to commit: nothing can fail after this point. */

	/* Process and free updates. */
	for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
		switch (op->type) {
		case WT_TXN_OP_BASIC:
		case WT_TXN_OP_BASIC_TS:
		case WT_TXN_OP_INMEM:
			/*
			 * Switch reserved operations to abort to
			 * simplify obsolete update list truncation.
			 */
			if (op->u.upd->type == WT_UPDATE_RESERVED) {
				op->u.upd->txnid = WT_TXN_ABORTED;
				break;
			}

			/*
			 * Writes to the lookaside file can be evicted as soon
			 * as they commit.
			 */
			if (conn->cache->las_fileid != 0 &&
			    op->fileid == conn->cache->las_fileid) {
				op->u.upd->txnid = WT_TXN_NONE;
				break;
			}

#ifdef HAVE_TIMESTAMPS
			if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
			    op->type != WT_TXN_OP_BASIC_TS) {
				WT_ASSERT(session,
				    op->fileid != WT_METAFILE_ID);
				__wt_timestamp_set(&op->u.upd->timestamp,
				    &txn->commit_timestamp);
			}
#endif
			break;

		case WT_TXN_OP_REF:
#ifdef HAVE_TIMESTAMPS
			if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
				__wt_timestamp_set(
				    &op->u.ref->page_del->timestamp,
				    &txn->commit_timestamp);
#endif
			break;

		case WT_TXN_OP_TRUNCATE_COL:
		case WT_TXN_OP_TRUNCATE_ROW:
			/* Other operations don't need timestamps. */
			break;
		}

		__wt_txn_op_free(session, op);
	}
	txn->mod_count = 0;

#ifdef HAVE_TIMESTAMPS
	/*
	 * Track the largest commit timestamp we have seen.
	 *
	 * We don't actually clear the local commit timestamp, just the flag.
	 * That said, we can't update the global commit timestamp until this
	 * transaction is visible, which happens when we release it.
	 */
	update_timestamp = F_ISSET(txn, WT_TXN_HAS_TS_COMMIT);
#endif

	__wt_txn_release(session);
	if (locked)
		__wt_readunlock(session, &txn_global->visibility_rwlock);

#ifdef HAVE_TIMESTAMPS
	/* First check if we've already committed something in the future. */
	if (update_timestamp) {
		WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
		    __wt_timestamp_set(
			&prev_commit_timestamp, &txn_global->commit_timestamp));
		update_timestamp = __wt_timestamp_cmp(
		    &txn->commit_timestamp, &prev_commit_timestamp) > 0;
	}

	/*
	 * If it looks like we need to move the global commit timestamp,
	 * write lock and re-check.
	 */
	if (update_timestamp) {
#if WT_TIMESTAMP_SIZE == 8
		while (__wt_timestamp_cmp(
		    &txn->commit_timestamp, &prev_commit_timestamp) > 0) {
			if (__wt_atomic_cas64(
			    &txn_global->commit_timestamp.val,
			    prev_commit_timestamp.val,
			    txn->commit_timestamp.val)) {
				txn_global->has_commit_timestamp = true;
				break;
			}
			__wt_timestamp_set(
			    &prev_commit_timestamp, &txn_global->commit_timestamp);
		}
#else
		__wt_writelock(session, &txn_global->rwlock);
		if (__wt_timestamp_cmp(&txn->commit_timestamp,
		    &txn_global->commit_timestamp) > 0) {
			__wt_timestamp_set(&txn_global->commit_timestamp,
			    &txn->commit_timestamp);
			txn_global->has_commit_timestamp = true;
		}
		__wt_writeunlock(session, &txn_global->rwlock);
#endif
	}
#endif

	/*
	 * We're between transactions: if we need to block for eviction, it's
	 * a good time to do so.  Note that we must ignore any error return
	 * because the user's data is committed.
	 */
	if (!readonly)
		(void)__wt_cache_eviction_check(session, false, false, NULL);
	return (0);

err:	/*
	 * If anything went wrong, roll back.
	 *
	 * !!!
	 * Nothing can fail after this point.
	 */
	if (locked)
		__wt_readunlock(session, &txn_global->visibility_rwlock);
	WT_TRET(__wt_txn_rollback(session, cfg));
	return (ret);
}
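
The function above parses the optional "commit_timestamp" and "sync" configuration strings passed at commit time. Below is a minimal sketch of driving that path through the public WiredTiger API; the home directory, table name, and hexadecimal timestamp value are illustrative only, and the commit_timestamp setting assumes a build with timestamp support (HAVE_TIMESTAMPS).

/*
 * Sketch: commit a transaction with an explicit commit timestamp and no log
 * sync, exercising the "commit_timestamp" and "sync" parsing shown above.
 */
#include <wiredtiger.h>

int
commit_with_timestamp_example(void)
{
	WT_CONNECTION *conn;
	WT_CURSOR *cursor;
	WT_SESSION *session;
	int ret;

	/* Open an environment and a simple string-keyed table. */
	if ((ret = wiredtiger_open("WT_HOME", NULL, "create", &conn)) != 0)
		return (ret);
	if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
		goto err;
	if ((ret = session->create(session,
	    "table:example", "key_format=S,value_format=S")) != 0)
		goto err;

	/* Make a change inside a snapshot-isolation transaction. */
	if ((ret = session->begin_transaction(session,
	    "isolation=snapshot")) != 0)
		goto err;
	if ((ret = session->open_cursor(session,
	    "table:example", NULL, NULL, &cursor)) != 0)
		goto err;
	cursor->set_key(cursor, "key");
	cursor->set_value(cursor, "value");
	if ((ret = cursor->insert(cursor)) != 0)
		goto err;

	/*
	 * Timestamps are passed as hexadecimal strings; "sync=off" skips the
	 * log flush, matching the "off" case parsed in the function above.
	 */
	ret = session->commit_transaction(session,
	    "commit_timestamp=2a,sync=off");

err:	(void)conn->close(conn, NULL);
	return (ret);
}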
Exemplo n.º 28
0
/*
 * __clsm_enter_update --
 *	Make sure an LSM cursor is ready to perform an update.
 */
static int
__clsm_enter_update(WT_CURSOR_LSM *clsm)
{
	WT_CURSOR *primary;
	WT_LSM_CHUNK *primary_chunk;
	WT_LSM_TREE *lsm_tree;
	WT_SESSION_IMPL *session;
	bool hard_limit, have_primary, ovfl;

	lsm_tree = clsm->lsm_tree;
	ovfl = false;
	session = (WT_SESSION_IMPL *)clsm->iface.session;

	if (clsm->nchunks == 0) {
		primary = NULL;
		have_primary = false;
	} else {
		primary = clsm->cursors[clsm->nchunks - 1];
		primary_chunk = clsm->primary_chunk;
		WT_ASSERT(session, F_ISSET(&session->txn, WT_TXN_HAS_ID));
		have_primary = (primary != NULL && primary_chunk != NULL &&
		    (primary_chunk->switch_txn == WT_TXN_NONE ||
		    WT_TXNID_LT(session->txn.id, primary_chunk->switch_txn)));
	}

	/*
	 * In LSM there are multiple btrees active at one time. The tree
	 * switch code needs to use btree API methods, and it wants to
	 * operate on the btree for the primary chunk. Set that up now.
	 *
	 * If the primary chunk has grown too large, set a flag so the worker
	 * thread will switch when it gets a chance to avoid introducing high
	 * latency into application threads.  Don't do this indefinitely: if a
	 * chunk grows twice as large as the configured size, block until it
	 * can be switched.
	 */
	hard_limit = F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);

	if (have_primary) {
		WT_ENTER_PAGE_INDEX(session);
		WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)primary)->btree,
		    ovfl = __wt_btree_lsm_over_size(session, hard_limit ?
		    2 * lsm_tree->chunk_size : lsm_tree->chunk_size));
		WT_LEAVE_PAGE_INDEX(session);

		/* If there was no overflow, we're done. */
		if (!ovfl)
			return (0);
	}

	/* Request a switch. */
	WT_RET(__wt_clsm_request_switch(clsm));

	/* If we only overflowed the soft limit, we're done. */
	if (have_primary && !hard_limit)
		return (0);

	WT_RET(__wt_clsm_await_switch(clsm));

	return (0);
}
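
The soft/hard limit behavior described in the comment above can be summarized as a small, standalone decision function. This is only an illustrative sketch of the policy for the have-primary case; the names below are hypothetical and are not WiredTiger internals.

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical outcomes mirroring the policy described above. */
enum chunk_action {
	CHUNK_OK,		/* Under the limit, keep writing. */
	CHUNK_REQUEST_SWITCH,	/* Over the soft limit, ask a worker. */
	CHUNK_REQUEST_AND_WAIT	/* Over the hard limit, block for it. */
};

enum chunk_action
chunk_size_policy(uint64_t chunk_bytes, uint64_t configured_bytes,
    bool switch_already_pending)
{
	uint64_t limit;

	/* A pending switch raises the threshold to twice the configured size. */
	limit = switch_already_pending ?
	    2 * configured_bytes : configured_bytes;

	if (chunk_bytes <= limit)
		return (CHUNK_OK);

	/*
	 * Past the soft limit, request a switch but keep going; only block
	 * once the chunk reaches the hard limit, so application threads
	 * rarely stall on a switch.
	 */
	return (switch_already_pending ?
	    CHUNK_REQUEST_AND_WAIT : CHUNK_REQUEST_SWITCH);
}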
Exemplo n.º 29
0
/*
 * cl_attr --
 *	Toggle a screen attribute on/off.
 *
 * PUBLIC: int cl_attr __P((SCR *, scr_attr_t, int));
 */
int
cl_attr(SCR *sp, scr_attr_t attribute, int on)
{
	CL_PRIVATE *clp;
	WINDOW *win;

	clp = CLP(sp);
	win = CLSP(sp) ? CLSP(sp) : stdscr;

	switch (attribute) {
	case SA_ALTERNATE:
	/*
	 * !!!
	 * There's a major layering violation here.  The problem is that the
	 * X11 xterm screen has what's known as an "alternate" screen.  Some
	 * xterm termcap/terminfo entries include sequences to switch to/from
	 * that alternate screen as part of the ti/te (smcup/rmcup) strings.
	 * Vi runs in the alternate screen, so that you are returned to the
	 * same screen contents on exit from vi that you had when you entered
	 * vi.  Further, when you run :shell, or :!date or similar ex commands,
	 * you also see the original screen contents.  This wasn't deliberate
	 * on vi's part, it's just that it historically sent terminal init/end
	 * sequences at those times, and the addition of the alternate screen
	 * sequences to the strings changed the behavior of vi.  The problem
	 * caused by this is that we don't want to switch back to the alternate
	 * screen while getting a new command from the user, when the user is
	 * continuing to enter ex commands, e.g.:
	 *
	 *	:!date				<<< switch to original screen
	 *	[Hit return to continue]	<<< prompt user to continue
	 *	:command			<<< get command from user
	 *
	 * Note that the :command input is a true vi input mode, e.g., input
	 * maps and abbreviations are being done.  So, we need to be able to
	 * switch back into the vi screen mode, without flashing the screen. 
	 *
	 * To make matters worse, the curses initscr() and endwin() calls will
	 * do this automatically -- so, this attribute isn't controlled by the
	 * higher-level screen as closely as one might like.
	 */
		if (on) {
			if (clp->ti_te != TI_SENT) {
				clp->ti_te = TI_SENT;
				if (clp->smcup == NULL)
					(void)cl_getcap(sp, "smcup", &clp->smcup);
				if (clp->smcup != NULL)
					(void)tputs(clp->smcup, 1, cl_putchar);
			}
		} else
			if (clp->ti_te != TE_SENT) {
				clp->ti_te = TE_SENT;
				if (clp->rmcup == NULL)
					(void)cl_getcap(sp, "rmcup", &clp->rmcup);
				if (clp->rmcup != NULL)
					(void)tputs(clp->rmcup, 1, cl_putchar);
				(void)fflush(stdout);
			}
		(void)fflush(stdout);
		break;
	case SA_INVERSE:
		if (F_ISSET(sp, SC_EX | SC_SCR_EXWROTE)) {
			if (clp->smso == NULL)
				return (1);
			if (on)
				(void)tputs(clp->smso, 1, cl_putchar);
			else
				(void)tputs(clp->rmso, 1, cl_putchar);
			(void)fflush(stdout);
		} else {
			if (on)
				(void)wstandout(win);
			else
				(void)wstandend(win);
		}
		break;
	default:
		abort();
	}
	return (0);
}
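
The layering note above revolves around the xterm alternate screen and the smcup/rmcup capabilities. Below is a minimal standalone sketch that emits those capabilities directly; it assumes an ncurses-style terminfo interface (<term.h>, link with -lncurses) rather than vi's cl_getcap()/cl_putchar wrappers.

/*
 * Minimal sketch: fetch and emit the smcup/rmcup capabilities via the
 * terminfo API, switching to the xterm alternate screen and back.
 */
#include <stdio.h>
#include <unistd.h>
#include <curses.h>
#include <term.h>

int
main(void)
{
	char *rmcup, *smcup;
	int err;

	if (setupterm(NULL, STDOUT_FILENO, &err) != OK)
		return (1);

	smcup = tigetstr("smcup");
	rmcup = tigetstr("rmcup");
	if (smcup == NULL || smcup == (char *)-1 ||
	    rmcup == NULL || rmcup == (char *)-1)
		return (1);		/* No alternate screen support. */

	(void)tputs(smcup, 1, putchar);	/* Enter the alternate screen. */
	(void)fflush(stdout);
	sleep(2);			/* ...draw a full-screen UI here... */
	(void)tputs(rmcup, 1, putchar);	/* Restore the original contents. */
	(void)fflush(stdout);
	return (0);
}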
Exemplo n.º 30
0
/*
 * __wt_btcur_search_near --
 *	Search for a record in the tree.
 */
int
__wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	int exact;
	bool valid;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	upd = NULL;					/* -Wuninitialized */
	exact = 0;

	WT_STAT_CONN_INCR(session, cursor_search_near);
	WT_STAT_DATA_INCR(session, cursor_search_near);

	/*
	 * If we have a row-store page pinned, search it; if we don't have a
	 * page pinned, or the search of the pinned page doesn't find an exact
	 * match, search from the root. Unlike WT_CURSOR.search, ignore pinned
	 * pages in the case of column-store: search-near isn't an interesting
	 * enough case for column-store to justify the complexity needed to
	 * avoid the tree search.
	 *
	 * Set the "insert" flag for the btree row-store search; we may intend
	 * to position the cursor at the end of the tree, rather than match an
	 * existing record.
	 */
	valid = false;
	if (btree->type == BTREE_ROW &&
	    F_ISSET(cbt, WT_CBT_ACTIVE) &&
	    cbt->ref->page->read_gen != WT_READGEN_OLDEST) {
		WT_ERR(__wt_txn_cursor_op(session));

		WT_ERR(__cursor_row_search(session, cbt, cbt->ref, true));

		/*
		 * Search-near is trickier than search when searching an already
		 * pinned page. If search returns the first or last page slots,
		 * discard the results and search the full tree as the neighbor
		 * pages might offer better matches. This test is simplistic as
		 * we're ignoring append lists (there may be no page slots or we
		 * might be legitimately positioned after the last page slot).
		 * Ignore those cases, it makes things too complicated.
		 */
		if (cbt->slot != 0 &&
		    cbt->slot != cbt->ref->page->pg_row_entries - 1)
			valid = __cursor_valid(cbt, &upd);
	}
	if (!valid) {
		WT_ERR(__cursor_func_init(cbt, true));
		WT_ERR(btree->type == BTREE_ROW ?
		    __cursor_row_search(session, cbt, NULL, true) :
		    __cursor_col_search(session, cbt, NULL));
		valid = __cursor_valid(cbt, &upd);
	}

	/*
	 * If we find a valid key, return it.
	 *
	 * Else, creating a record past the end of the tree in a fixed-length
	 * column-store implicitly fills the gap with empty records.  In this
	 * case, we instantiate the empty record, it's an exact match.
	 *
	 * Else, move to the next key in the tree (bias for prefix searches).
	 * Cursor next skips invalid rows, so we don't have to test for them
	 * again.
	 *
	 * Else, redo the search and move to the previous key in the tree.
	 * Cursor previous skips invalid rows, so we don't have to test for
	 * them again.
	 *
	 * If that fails, quit, there's no record to return.
	 */
	if (valid) {
		exact = cbt->compare;
		ret = __wt_kv_return(session, cbt, upd);
	} else if (__cursor_fix_implicit(btree, cbt)) {
		cbt->recno = cursor->recno;
		cbt->v = 0;
		cursor->value.data = &cbt->v;
		cursor->value.size = 1;
		exact = 0;
	} else if ((ret = __wt_btcur_next(cbt, false)) != WT_NOTFOUND)
		exact = 1;
	else {
		WT_ERR(__cursor_func_init(cbt, true));
		WT_ERR(btree->type == BTREE_ROW ?
		    __cursor_row_search(session, cbt, NULL, true) :
		    __cursor_col_search(session, cbt, NULL));
		if (__cursor_valid(cbt, &upd)) {
			exact = cbt->compare;
			ret = __wt_kv_return(session, cbt, upd);
		} else if ((ret = __wt_btcur_prev(cbt, false)) != WT_NOTFOUND)
			exact = -1;
	}

#ifdef HAVE_DIAGNOSTIC
	if (ret == 0)
		WT_ERR(__wt_cursor_key_order_init(session, cbt));
#endif

err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	if (exactp != NULL && (ret == 0 || ret == WT_NOTFOUND))
		*exactp = exact;
	return (ret);
}
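
This routine backs the public WT_CURSOR::search_near method, which positions the cursor on the closest match and reports the comparison through "exact" (negative: the positioned key is smaller than the search key, zero: exact match, positive: larger). A short sketch of calling it follows; it assumes a session and a string-format table named "table:example" already exist.

/* Sketch: drive the function above through the public cursor API. */
#include <stdio.h>
#include <wiredtiger.h>

int
find_nearest(WT_SESSION *session, const char *search_key)
{
	WT_CURSOR *cursor;
	const char *key, *value;
	int exact, ret;

	if ((ret = session->open_cursor(session,
	    "table:example", NULL, NULL, &cursor)) != 0)
		return (ret);

	cursor->set_key(cursor, search_key);
	if ((ret = cursor->search_near(cursor, &exact)) == 0 &&
	    (ret = cursor->get_key(cursor, &key)) == 0 &&
	    (ret = cursor->get_value(cursor, &value)) == 0)
		printf("%s match: %s=%s\n",
		    exact == 0 ? "exact" : exact > 0 ? "next" : "previous",
		    key, value);

	/* WT_NOTFOUND means there were no visible records at all. */
	(void)cursor->close(cursor);
	return (ret);
}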