예제 #1
0
파일: bt_cursor.c 프로젝트: 3rf/mongo
/*
 * __wt_btcur_compare --
 *	Return a comparison between two cursors.
 */
int
__wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp)
{
	WT_BTREE *btree;
	WT_CURSOR *a, *b;
	WT_SESSION_IMPL *session;

	a = (WT_CURSOR *)a_arg;
	b = (WT_CURSOR *)b_arg;
	btree = a_arg->btree;
	session = (WT_SESSION_IMPL *)a->session;

	switch (btree->type) {
	case BTREE_COL_FIX:
	case BTREE_COL_VAR:
		/*
		 * Compare the interface's cursor record, not the underlying
		 * cursor reference: the interface's cursor reference is the
		 * one being returned to the application.
		 */
		if (a->recno < b->recno)
			*cmpp = -1;
		else if (a->recno == b->recno)
			*cmpp = 0;
		else
			*cmpp = 1;
		break;
	case BTREE_ROW:
		WT_RET(__wt_compare(
		    session, btree->collator, &a->key, &b->key, cmpp));
		break;
	WT_ILLEGAL_VALUE(session);
	}
	return (0);
}
예제 #2
0
파일: bt_stat.c 프로젝트: sylvanchil/mongo
/*
 * __stat_page --
 *	Stat any Btree page.
 */
static int
__stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats)
{
	/*
	 * All internal pages and overflow pages are trivial, all we track is
	 * a count of the page type.
	 */
	switch (page->type) {
	case WT_PAGE_COL_FIX:
		WT_STAT_INCR(session, stats, btree_column_fix);
		WT_STAT_INCRV(
		    session, stats, btree_entries, page->pg_fix_entries);
		break;
	case WT_PAGE_COL_INT:
		WT_STAT_INCR(session, stats, btree_column_internal);
		break;
	case WT_PAGE_COL_VAR:
		__stat_page_col_var(session, page, stats);
		break;
	case WT_PAGE_ROW_INT:
		__stat_page_row_int(session, page, stats);
		break;
	case WT_PAGE_ROW_LEAF:
		__stat_page_row_leaf(session, page, stats);
		break;
	WT_ILLEGAL_VALUE(session);
	}
	return (0);
}
예제 #3
0
/*
 * wiredtiger_pack_str --
 *	Pack a string.
 */
int
wiredtiger_pack_str(WT_PACK_STREAM *ps, const char *s)
{
	WT_DECL_PACK_VALUE(pv);
	WT_SESSION_IMPL *session;

	session = ps->pack.session;

	/* Lower-level packing routines treat a length of zero as unchecked. */
	if (ps->p >= ps->end)
		return (ENOMEM);

	WT_RET(__pack_next(&ps->pack, &pv));
	switch (pv.type) {
	case 'S':
	case 's':
		pv.u.s = s;
		WT_RET(__pack_write(
		    session, &pv, &ps->p, (size_t)(ps->end - ps->p)));
		break;
	WT_ILLEGAL_VALUE(session);
	}

	return (0);
}
예제 #4
0
/*
 * wiredtiger_pack_item --
 *	Pack an item.
 */
int
wiredtiger_pack_item(WT_PACK_STREAM *ps, WT_ITEM *item)
{
	WT_DECL_PACK_VALUE(pv);
	WT_SESSION_IMPL *session;

	session = ps->pack.session;

	/* Lower-level packing routines treat a length of zero as unchecked. */
	if (ps->p >= ps->end)
		return (ENOMEM);

	WT_RET(__pack_next(&ps->pack, &pv));
	switch (pv.type) {
	case 'U':
	case 'u':
		pv.u.item.data = item->data;
		pv.u.item.size = item->size;
		WT_RET(__pack_write(
		    session, &pv, &ps->p, (size_t)(ps->end - ps->p)));
		break;
	WT_ILLEGAL_VALUE(session);
	}

	return (0);
}
예제 #5
0
/*
 * wiredtiger_unpack_uint --
 *	Unpack an unsigned integer.
 */
int
wiredtiger_unpack_uint(WT_PACK_STREAM *ps, uint64_t *up)
{
	WT_DECL_PACK_VALUE(pv);
	WT_SESSION_IMPL *session;

	session = ps->pack.session;

	/* Lower-level packing routines treat a length of zero as unchecked. */
	if (ps->p >= ps->end)
		return (ENOMEM);

	WT_RET(__pack_next(&ps->pack, &pv));
	switch (pv.type) {
	case 'B':
	case 'H':
	case 'I':
	case 'L':
	case 'Q':
	case 'R':
	case 'r':
	case 't':
		WT_RET(__unpack_read(session,
		    &pv, (const uint8_t **)&ps->p, (size_t)(ps->end - ps->p)));
		*up = pv.u.u;
		break;
	WT_ILLEGAL_VALUE(session);
	}
	return (0);
}
예제 #6
0
파일: bt_sync.c 프로젝트: BobbWu/wiredtiger
/*
 * __wt_cache_op --
 *	Cache operations.
 */
int
__wt_cache_op(WT_SESSION_IMPL *session, WT_CACHE_OP op)
{
	switch (op) {
	case WT_SYNC_CHECKPOINT:
	case WT_SYNC_CLOSE:
		/*
		 * Make sure the checkpoint reference is set for
		 * reconciliation; it's ugly, but drilling a function parameter
		 * path from our callers to the reconciliation of the tree's
		 * root page is going to be worse.
		 */
		WT_ASSERT(session, S2BT(session)->ckpt != NULL);
		break;
	case WT_SYNC_DISCARD:
	case WT_SYNC_WRITE_LEAVES:
		break;
	}

	switch (op) {
	case WT_SYNC_CHECKPOINT:
	case WT_SYNC_WRITE_LEAVES:
		return (__sync_file(session, op));
	case WT_SYNC_CLOSE:
	case WT_SYNC_DISCARD:
		return (__wt_evict_file(session, op));
	WT_ILLEGAL_VALUE(session);
	}
}
예제 #7
0
/*
 * __wt_debug_disk --
 *	Dump a disk page in debugging mode.
 */
int
__wt_debug_disk(
    WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, const char *ofile)
{
	WT_DBG *ds, _ds;

	ds = &_ds;
	WT_RET(__debug_config(session, ds, ofile));

	WT_RET(ds->f(ds, "%s page", __wt_page_type_string(dsk->type)));
	switch (dsk->type) {
	case WT_PAGE_BLOCK_MANAGER:
		break;
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
		WT_RET(ds->f(ds, ", recno %" PRIu64, dsk->recno));
		/* FALLTHROUGH */
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		WT_RET(ds->f(ds, ", entries %" PRIu32, dsk->u.entries));
		break;
	case WT_PAGE_OVFL:
		WT_RET(ds->f(ds, ", datalen %" PRIu32, dsk->u.datalen));
		break;
	WT_ILLEGAL_VALUE(session);
	}

	if (F_ISSET(dsk, WT_PAGE_COMPRESSED))
		WT_RET(ds->f(ds, ", compressed"));
	if (F_ISSET(dsk, WT_PAGE_ENCRYPTED))
		WT_RET(ds->f(ds, ", encrypted"));
	if (F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL))
		WT_RET(ds->f(ds, ", empty-all"));
	if (F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE))
		WT_RET(ds->f(ds, ", empty-none"));
	if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE))
		WT_RET(ds->f(ds, ", LAS-update"));

	WT_RET(ds->f(ds, ", generation %" PRIu64 "\n", dsk->write_gen));

	switch (dsk->type) {
	case WT_PAGE_BLOCK_MANAGER:
		break;
	case WT_PAGE_COL_FIX:
		WT_RET(__debug_dsk_col_fix(ds, dsk));
		break;
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		WT_RET(__debug_dsk_cell(ds, dsk));
		break;
	default:
		break;
	}

	return (__dmsg_wrapup(ds));
}
예제 #8
0
/*
 * __wt_btcur_update --
 *	Update a record in the tree.
 */
int
__wt_btcur_update(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_SESSION_IMPL *session;
	int ret;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	WT_BSTAT_INCR(session, cursor_updates);

	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));
	WT_RET(__cursor_size_chk(session, &cursor->value));

retry:	__cursor_func_init(cbt, 1);

	switch (btree->type) {
	case BTREE_COL_FIX:
		if (cursor->value.size != 1)
			WT_RET_MSG(session, EINVAL,
			    "item size of %" PRIu32 " does not match "
			    "fixed-length file requirement of 1 byte",
			    cursor->value.size);
		/* FALLTHROUGH */
	case BTREE_COL_VAR:
		WT_ERR(__wt_col_search(session, cbt, 1));

		/*
		 * Update the record if it exists.  Creating a record past the
		 * end of the tree in a fixed-length column-store implicitly
		 * fills the gap with empty records.  Update the record in that
		 * case, the record exists.
		 */
		if ((cbt->compare != 0 || __cursor_invalid(cbt)) &&
		    !__cursor_fix_implicit(btree, cbt))
			ret = WT_NOTFOUND;
		else if ((ret = __wt_col_modify(session, cbt, 3)) == WT_RESTART)
			goto retry;
		break;
	case BTREE_ROW:
		/* Update the record it it exists. */
		WT_ERR(__wt_row_search(session, cbt, 1));
		if (cbt->compare != 0 || __cursor_invalid(cbt))
			ret = WT_NOTFOUND;
		else if ((ret = __wt_row_modify(session, cbt, 0)) == WT_RESTART)
			goto retry;
		break;
	WT_ILLEGAL_VALUE(session);
	}

err:	__cursor_func_resolve(cbt, ret);

	return (ret);
}
예제 #9
0
/*
 * __rec_page_dirty_update --
 *	Update a dirty page's reference on eviction.
 */
static int
__rec_page_dirty_update(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_ADDR *addr;
	WT_PAGE_MODIFY *mod;
	WT_REF *parent_ref;

	mod = page->modify;
	parent_ref = page->ref;

	switch (F_ISSET(mod, WT_PM_REC_MASK)) {
	case WT_PM_REC_REPLACE: 			/* 1-for-1 page swap */
		if (parent_ref->addr != NULL &&
		    __wt_off_page(page->parent, parent_ref->addr)) {
			__wt_free(session, ((WT_ADDR *)parent_ref->addr)->addr);
			__wt_free(session, parent_ref->addr);
		}

		/*
		 * Update the parent to reference the replacement page.
		 *
		 * Publish: a barrier to ensure the structure fields are set
		 * before the state change makes the page available to readers.
		 */
		WT_RET(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));
		*addr = mod->u.replace;
		mod->u.replace.addr = NULL;
		mod->u.replace.size = 0;

		parent_ref->page = NULL;
		parent_ref->addr = addr;
		WT_PUBLISH(parent_ref->state, WT_REF_DISK);
		break;
	case WT_PM_REC_SPLIT:				/* Page split */
		/*
		 * Update the parent to reference new internal page(s).
		 *
		 * Publish: a barrier to ensure the structure fields are set
		 * before the state change makes the page available to readers.
		 */
		parent_ref->page = mod->u.split;
		WT_PUBLISH(parent_ref->state, WT_REF_MEM);

		/* Clear the reference else discarding the page will free it. */
		mod->u.split = NULL;
		F_CLR(mod, WT_PM_REC_SPLIT);
		break;
	case WT_PM_REC_EMPTY:				/* Page is empty */
		/* We checked if the page was empty when we reviewed it. */
		/* FALLTHROUGH */
	WT_ILLEGAL_VALUE(session);
	}

	return (0);
}
예제 #10
0
파일: bt_curnext.c 프로젝트: DINKIN/mongo
/*
 * __wt_cursor_key_order_check --
 *	Check key ordering for cursor movements.
 */
int
__wt_cursor_key_order_check(
    WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next)
{
	switch (cbt->ref->page->type) {
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_VAR:
		return (__cursor_key_order_check_col(session, cbt, next));
	case WT_PAGE_ROW_LEAF:
		return (__cursor_key_order_check_row(session, cbt, next));
	WT_ILLEGAL_VALUE(session);
	}
	/* NOTREACHED */
}
예제 #11
0
/*
 * __wt_btcur_remove --
 *	Remove a record from the tree.
 */
int
__wt_btcur_remove(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_SESSION_IMPL *session;
	int ret;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	WT_BSTAT_INCR(session, cursor_removes);

	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));

retry:	__cursor_func_init(cbt, 1);

	switch (btree->type) {
	case BTREE_COL_FIX:
	case BTREE_COL_VAR:
		WT_ERR(__wt_col_search(session, cbt, 1));

		/*
		 * Remove the record if it exists.  Creating a record past the
		 * end of the tree in a fixed-length column-store implicitly
		 * fills the gap with empty records.  Return success in that
		 * case, the record was deleted successfully.
		 */
		if (cbt->compare != 0 || __cursor_invalid(cbt))
			ret =
			    __cursor_fix_implicit(btree, cbt) ? 0 : WT_NOTFOUND;
		else if ((ret = __wt_col_modify(session, cbt, 2)) == WT_RESTART)
			goto retry;
		break;
	case BTREE_ROW:
		/* Remove the record if it exists. */
		WT_ERR(__wt_row_search(session, cbt, 1));
		if (cbt->compare != 0 || __cursor_invalid(cbt))
			ret = WT_NOTFOUND;
		else if ((ret = __wt_row_modify(session, cbt, 1)) == WT_RESTART)
			goto retry;
		break;
	WT_ILLEGAL_VALUE(session);
	}

err:	__cursor_func_resolve(cbt, ret);

	return (ret);
}
예제 #12
0
파일: bt_ovfl.c 프로젝트: qihsh/mongo
/*
 * __wt_ovfl_discard --
 *	Discard an on-page overflow value, and reset the page's cell.
 */
int
__wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_DECL_RET;

	btree = S2BT(session);
	bm = btree->bm;
	unpack = &_unpack;

	__wt_cell_unpack(cell, unpack);

	/*
	 * Finally remove overflow key/value objects, called when reconciliation
	 * finishes after successfully writing a page.
	 *
	 * Keys must have already been instantiated and value objects must have
	 * already been cached (if they might potentially still be read by any
	 * running transaction).
	 *
	 * Acquire the overflow lock to avoid racing with a thread reading the
	 * backing overflow blocks.
	 */
	WT_RET(__wt_writelock(session, btree->ovfl_lock));

	switch (unpack->raw) {
	case WT_CELL_KEY_OVFL:
		__wt_cell_type_reset(session,
		    unpack->cell, WT_CELL_KEY_OVFL, WT_CELL_KEY_OVFL_RM);
		break;
	case WT_CELL_VALUE_OVFL:
		__wt_cell_type_reset(session,
		    unpack->cell, WT_CELL_VALUE_OVFL, WT_CELL_VALUE_OVFL_RM);
		break;
	WT_ILLEGAL_VALUE(session);
	}

	WT_TRET(__wt_writeunlock(session, btree->ovfl_lock));

	/* Free the backing disk blocks. */
	WT_TRET(bm->free(bm, session, unpack->data, unpack->size));

	return (ret);
}
예제 #13
0
파일: txn_log.c 프로젝트: qihsh/mongo
/*
 * __wt_txn_log_op --
 *	Write the last logged operation into the in-memory buffer.
 */
int
__wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
{
	WT_ITEM *logrec;
	WT_TXN *txn;
	WT_TXN_OP *op;

	txn = &session->txn;

	if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED) ||
	    F_ISSET(session, WT_SESSION_NO_LOGGING) ||
	    F_ISSET(S2BT(session), WT_BTREE_NO_LOGGING))
		return (0);

	/* We'd better have a transaction. */
	WT_ASSERT(session,
	    F_ISSET(txn, WT_TXN_RUNNING) && F_ISSET(txn, WT_TXN_HAS_ID));

	WT_ASSERT(session, txn->mod_count > 0);
	op = txn->mod + txn->mod_count - 1;

	WT_RET(__txn_logrec_init(session));
	logrec = txn->logrec;

	switch (op->type) {
	case WT_TXN_OP_BASIC:
		return (__txn_op_log(session, logrec, op, cbt));
	case WT_TXN_OP_INMEM:
	case WT_TXN_OP_REF:
		/* Nothing to log, we're done. */
		return (0);
	case WT_TXN_OP_TRUNCATE_COL:
		return (__wt_logop_col_truncate_pack(session, logrec,
		    op->fileid,
		    op->u.truncate_col.start, op->u.truncate_col.stop));
	case WT_TXN_OP_TRUNCATE_ROW:
		return (__wt_logop_row_truncate_pack(session, txn->logrec,
		    op->fileid,
		    &op->u.truncate_row.start, &op->u.truncate_row.stop,
		    (uint32_t)op->u.truncate_row.mode));
	WT_ILLEGAL_VALUE(session);
	}

	/* NOTREACHED */
}
예제 #14
0
/*
 * __wt_debug_disk --
 *	Dump a disk page in debugging mode.
 */
int
__wt_debug_disk(
    WT_SESSION_IMPL *session, WT_PAGE_HEADER *dsk, const char *ofile)
{
	WT_DBG *ds, _ds;
	WT_DECL_RET;

	ds = &_ds;
	WT_RET(__debug_config(session, ds, ofile));

	__dmsg(ds, "%s page", __wt_page_type_string(dsk->type));
	switch (dsk->type) {
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
		__dmsg(ds, ", recno %" PRIu64, dsk->recno);
		/* FALLTHROUGH */
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		__dmsg(ds, ", entries %" PRIu32 "\n", dsk->u.entries);
		break;
	case WT_PAGE_OVFL:
		__dmsg(ds, ", datalen %" PRIu32 "\n", dsk->u.datalen);
		break;
	WT_ILLEGAL_VALUE(session);
	}

	switch (dsk->type) {
	case WT_PAGE_COL_FIX:
		__debug_dsk_col_fix(ds, dsk);
		break;
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		ret = __debug_dsk_cell(ds, dsk);
		break;
	default:
		break;
	}

	__dmsg_wrapup(ds);

	return (ret);
}
예제 #15
0
파일: bt_curnext.c 프로젝트: DINKIN/mongo
/*
 * __wt_cursor_key_order_init --
 *	Initialize key ordering checks for cursor movements after a successful
 * search.
 */
int
__wt_cursor_key_order_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
{
	/*
	 * Cursor searches set the position for cursor movements, set the
	 * last-key value for diagnostic checking.
	 */
	switch (cbt->ref->page->type) {
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_VAR:
		cbt->lastrecno = cbt->recno;
		return (0);
	case WT_PAGE_ROW_LEAF:
		return (__wt_buf_set(session,
		    cbt->lastkey, cbt->iface.key.data, cbt->iface.key.size));
	WT_ILLEGAL_VALUE(session);
	}
	/* NOTREACHED */
}
예제 #16
0
/*
 * __merge_walk --
 *	Visit all of the child references in a locked subtree and apply a
 *	callback function to them.
 */
static int
__merge_walk(WT_SESSION_IMPL *session, WT_PAGE *page, u_int depth,
	void (*visit)(WT_PAGE *, WT_REF *, WT_VISIT_STATE *),
	WT_VISIT_STATE *state)
{
	WT_PAGE *child;
	WT_REF *ref;
	uint32_t i;

	if (depth > state->maxdepth)
		state->maxdepth = depth;

	WT_REF_FOREACH(page, ref, i)
		switch (ref->state) {
		case WT_REF_LOCKED:
			child = ref->page;

			/*
			 * Visit internal pages recursively.  This must match
			 * the walk in __rec_review: if the merge succeeds, we
			 * have to unlock everything.
			 */
			if (child->type == page->type &&
			    __wt_btree_mergeable(child)) {
				WT_RET(__merge_walk(
				    session, child, depth + 1, visit, state));
				break;
			}
			/* FALLTHROUGH */

		case WT_REF_DELETED:
		case WT_REF_DISK:
			(*visit)(page, ref, state);
			break;

		case WT_REF_EVICT_WALK:
		case WT_REF_MEM:
		case WT_REF_READING:
		WT_ILLEGAL_VALUE(session);
		}

	return (0);
}
예제 #17
0
int
__wt_txn_op_printlog(
    WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
{
	uint32_t optype, opsize;

	/* Peek at the size and the type. */
	WT_RET(__wt_logop_read(session, pp, end, &optype, &opsize));
	end = *pp + opsize;

	switch (optype) {
	case WT_LOGOP_COL_PUT:
		WT_RET(__wt_logop_col_put_print(session, pp, end, out));
		break;

	case WT_LOGOP_COL_REMOVE:
		WT_RET(__wt_logop_col_remove_print(session, pp, end, out));
		break;

	case WT_LOGOP_COL_TRUNCATE:
		WT_RET(__wt_logop_col_truncate_print(session, pp, end, out));
		break;

	case WT_LOGOP_ROW_PUT:
		WT_RET(__wt_logop_row_put_print(session, pp, end, out));
		break;

	case WT_LOGOP_ROW_REMOVE:
		WT_RET(__wt_logop_row_remove_print(session, pp, end, out));
		break;

	case WT_LOGOP_ROW_TRUNCATE:
		WT_RET(__wt_logop_row_truncate_print(session, pp, end, out));
		break;

	WT_ILLEGAL_VALUE(session);
	}

	return (0);
}
예제 #18
0
/*
 * __wt_curbulk_init --
 *	Initialize a bulk cursor.
 */
int
__wt_curbulk_init(WT_SESSION_IMPL *session,
    WT_CURSOR_BULK *cbulk, bool bitmap, bool skip_sort_check)
{
	WT_CURSOR *c;
	WT_CURSOR_BTREE *cbt;

	c = &cbulk->cbt.iface;
	cbt = &cbulk->cbt;

	/* Bulk cursors only support insert and close (reset is a no-op). */
	__wt_cursor_set_notsup(c);
	switch (cbt->btree->type) {
	case BTREE_COL_FIX:
		c->insert = bitmap ?
		    __curbulk_insert_fix_bitmap : __curbulk_insert_fix;
		break;
	case BTREE_COL_VAR:
		c->insert = __curbulk_insert_var;
		break;
	case BTREE_ROW:
		/*
		 * Row-store order comparisons are expensive, so we optionally
		 * skip them when we know the input is correct.
		 */
		c->insert = skip_sort_check ?
		    __curbulk_insert_row_skip_check : __curbulk_insert_row;
		break;
	WT_ILLEGAL_VALUE(session);
	}

	cbulk->first_insert = true;
	cbulk->recno = 0;
	cbulk->bitmap = bitmap;
	if (bitmap)
		F_SET(c, WT_CURSTD_RAW);

	return (__wt_bulk_init(session, cbulk));
}
예제 #19
0
/*
 * __meta_track_apply --
 *	Apply the changes in a metadata tracking record.
 */
static int
__meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_RET;
	int tret;

	switch (trk->op) {
	case WT_ST_EMPTY:	/* Unused slot */
		break;
	case WT_ST_CHECKPOINT:	/* Checkpoint, see above */
		btree = trk->dhandle->handle;
		bm = btree->bm;
		WT_WITH_DHANDLE(session, trk->dhandle,
		    WT_TRET(bm->checkpoint_resolve(bm, session)));
		break;
	case WT_ST_DROP_COMMIT:
		if ((tret = __wt_remove_if_exists(session, trk->a)) != 0) {
			__wt_err(session, tret,
			    "metadata remove dropped file %s",
			    trk->a);
			WT_TRET(tret);
		}
		break;
	case WT_ST_LOCK:
		WT_WITH_DHANDLE(session, trk->dhandle,
		    WT_TRET(__wt_session_release_btree(session)));
		break;
	case WT_ST_FILEOP:
	case WT_ST_REMOVE:
	case WT_ST_SET:
		break;
	WT_ILLEGAL_VALUE(session);
	}

	__meta_track_clear(session, trk);
	return (ret);
}
예제 #20
0
/*
 * __merge_promote_key --
 *	Copy a key from a child page into the reference in its parent, so it
 *	can be found by searches.
 */
static int
__merge_promote_key(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_PAGE *page;
	WT_REF *child_ref;
	size_t size;
	void *p;

	page = ref->page;
	switch (page->type) {
	case WT_PAGE_COL_INT:
		child_ref = &page->u.intl.t[0];
		ref->key.recno = page->u.intl.recno = child_ref->key.recno;
		return (0);

	case WT_PAGE_ROW_INT:
		child_ref = &page->u.intl.t[0];
		__wt_ref_key(child_ref->page, child_ref, &p, &size);
		return (__wt_row_ikey_incr(
		    session, page, 0, p, size, &ref->key.ikey));

	WT_ILLEGAL_VALUE(session);
	}
}
예제 #21
0
/*
 * __merge_promote_key --
 *	Copy a key from a child page into the reference in its parent, so it
 *	can be found by searches.
 */
static int
__merge_promote_key(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_IKEY *ikey;
	WT_PAGE *page;
	WT_REF *child_ref;

	page = ref->page;
	switch (page->type) {
	case WT_PAGE_COL_INT:
		child_ref = &page->u.intl.t[0];
		ref->u.recno = page->u.intl.recno = child_ref->u.recno;
		return (0);

	case WT_PAGE_ROW_INT:
		child_ref = &page->u.intl.t[0];
		ikey = child_ref->u.key;
		WT_ASSERT(session, ikey != NULL);
		return (__wt_row_ikey_incr(session,
		    page, 0, WT_IKEY_DATA(ikey), ikey->size, &ref->u.key));

	WT_ILLEGAL_VALUE(session);
	}
}
예제 #22
0
파일: bt_page.c 프로젝트: 3rf/mongo
/*
 * __wt_page_inmem --
 *	Build in-memory page information.
 */
int
__wt_page_inmem(WT_SESSION_IMPL *session,
    WT_REF *ref, const void *image, uint32_t flags, WT_PAGE **pagep)
{
	WT_DECL_RET;
	WT_PAGE *page;
	const WT_PAGE_HEADER *dsk;
	uint32_t alloc_entries;
	size_t size;

	*pagep = NULL;

	dsk = image;
	alloc_entries = 0;

	/*
	 * Figure out how many underlying objects the page references so we can
	 * allocate them along with the page.
	 */
	switch (dsk->type) {
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
		/*
		 * Column-store leaf page entries map one-to-one to the number
		 * of physical entries on the page (each physical entry is a
		 * value item).
		 *
		 * Column-store internal page entries map one-to-one to the
		 * number of physical entries on the page (each entry is a
		 * location cookie).
		 */
		alloc_entries = dsk->u.entries;
		break;
	case WT_PAGE_ROW_INT:
		/*
		 * Row-store internal page entries map one-to-two to the number
		 * of physical entries on the page (each entry is a key and
		 * location cookie pair).
		 */
		alloc_entries = dsk->u.entries / 2;
		break;
	case WT_PAGE_ROW_LEAF:
		/*
		 * If the "no empty values" flag is set, row-store leaf page
		 * entries map one-to-one to the number of physical entries
		 * on the page (each physical entry is a key or value item).
		 * If that flag is not set, there are more keys than values,
		 * we have to walk the page to figure it out.
		 */
		if (F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL))
			alloc_entries = dsk->u.entries;
		else if (F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE))
			alloc_entries = dsk->u.entries / 2;
		else
			WT_RET(__inmem_row_leaf_entries(
			    session, dsk, &alloc_entries));
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* Allocate and initialize a new WT_PAGE. */
	WT_RET(__wt_page_alloc(
	    session, dsk->type, dsk->recno, alloc_entries, 1, &page));
	page->dsk = dsk;
	F_SET_ATOMIC(page, flags);

	/*
	 * Track the memory allocated to build this page so we can update the
	 * cache statistics in a single call.
	 */
	size = LF_ISSET(WT_PAGE_DISK_ALLOC) ? dsk->mem_size : 0;

	switch (page->type) {
	case WT_PAGE_COL_FIX:
		__inmem_col_fix(session, page);
		break;
	case WT_PAGE_COL_INT:
		__inmem_col_int(session, page);
		break;
	case WT_PAGE_COL_VAR:
		WT_ERR(__inmem_col_var(session, page, &size));
		break;
	case WT_PAGE_ROW_INT:
		WT_ERR(__inmem_row_int(session, page, &size));
		break;
	case WT_PAGE_ROW_LEAF:
		WT_ERR(__inmem_row_leaf(session, page));
		break;
	WT_ILLEGAL_VALUE_ERR(session);
	}

	/* Update the page's in-memory size and the cache statistics. */
	__wt_cache_page_inmem_incr(session, page, size);

	/* Link the new internal page to the parent. */
	if (ref != NULL) {
		switch (page->type) {
		case WT_PAGE_COL_INT:
		case WT_PAGE_ROW_INT:
			page->pg_intl_parent_ref = ref;
			break;
		}
		ref->page = page;
	}

	*pagep = page;
	return (0);

err:	__wt_page_out(session, &page);
	return (ret);
}
예제 #23
0
파일: bt_vrfy_dsk.c 프로젝트: radik/mongo
/*
 * __wt_verify_dsk_image --
 *	Verify a single block as read from disk.
 */
int
__wt_verify_dsk_image(WT_SESSION_IMPL *session,
    const char *tag, const WT_PAGE_HEADER *dsk, size_t size, int empty_page_ok)
{
	const uint8_t *p, *end;
	u_int i;
	uint8_t flags;

	/* Check the page type. */
	switch (dsk->type) {
	case WT_PAGE_BLOCK_MANAGER:
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
	case WT_PAGE_OVFL:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		break;
	case WT_PAGE_INVALID:
	default:
		WT_RET_VRFY(session,
		    "page at %s has an invalid type of %" PRIu32,
		    tag, dsk->type);
	}

	/* Check the page record number. */
	switch (dsk->type) {
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
		if (dsk->recno != 0)
			break;
		WT_RET_VRFY(session,
		    "%s page at %s has a record number of zero",
		    __wt_page_type_string(dsk->type), tag);
	case WT_PAGE_BLOCK_MANAGER:
	case WT_PAGE_OVFL:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		if (dsk->recno == 0)
			break;
		WT_RET_VRFY(session,
		    "%s page at %s has a non-zero record number",
		    __wt_page_type_string(dsk->type), tag);
	}

	/* Check the page flags. */
	flags = dsk->flags;
	if (LF_ISSET(WT_PAGE_COMPRESSED))
		LF_CLR(WT_PAGE_COMPRESSED);
	if (LF_ISSET(WT_PAGE_ENCRYPTED))
		LF_CLR(WT_PAGE_ENCRYPTED);
	if (dsk->type == WT_PAGE_ROW_LEAF) {
		if (LF_ISSET(WT_PAGE_EMPTY_V_ALL) &&
		    LF_ISSET(WT_PAGE_EMPTY_V_NONE))
			WT_RET_VRFY(session,
			    "page at %s has invalid flags combination: 0x%"
			    PRIx8,
			    tag, dsk->flags);
		if (LF_ISSET(WT_PAGE_EMPTY_V_ALL))
			LF_CLR(WT_PAGE_EMPTY_V_ALL);
		if (LF_ISSET(WT_PAGE_EMPTY_V_NONE))
			LF_CLR(WT_PAGE_EMPTY_V_NONE);
	}
	if (flags != 0)
		WT_RET_VRFY(session,
		    "page at %s has invalid flags set: 0x%" PRIx8,
		    tag, flags);

	/* Unused bytes */
	for (p = dsk->unused, i = sizeof(dsk->unused); i > 0; --i)
		if (*p != '\0')
			WT_RET_VRFY(session,
			    "page at %s has non-zero unused page header bytes",
			    tag);

	/*
	 * Any bytes after the data chunk should be nul bytes; ignore if the
	 * size is 0, that allows easy checking of disk images where we don't
	 * have the size.
	 */
	if (size != 0) {
		p = (uint8_t *)dsk + dsk->mem_size;
		end = (uint8_t *)dsk + size;
		for (; p < end; ++p)
			if (*p != '\0')
				WT_RET_VRFY(session,
				    "%s page at %s has non-zero trailing bytes",
				    __wt_page_type_string(dsk->type), tag);
	}

	/* Check for empty pages, then verify the items on the page. */
	switch (dsk->type) {
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_VAR:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		if (!empty_page_ok && dsk->u.entries == 0)
			WT_RET_VRFY(session, "%s page at %s has no entries",
			    __wt_page_type_string(dsk->type), tag);
		break;
	case WT_PAGE_BLOCK_MANAGER:
	case WT_PAGE_OVFL:
		if (dsk->u.datalen == 0)
			WT_RET_VRFY(session, "%s page at %s has no data",
			    __wt_page_type_string(dsk->type), tag);
		break;
	}
	switch (dsk->type) {
	case WT_PAGE_COL_INT:
		return (__verify_dsk_col_int(session, tag, dsk));
	case WT_PAGE_COL_FIX:
		return (__verify_dsk_col_fix(session, tag, dsk));
	case WT_PAGE_COL_VAR:
		return (__verify_dsk_col_var(session, tag, dsk));
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		return (__verify_dsk_row(session, tag, dsk));
	case WT_PAGE_BLOCK_MANAGER:
	case WT_PAGE_OVFL:
		return (__verify_dsk_chunk(session, tag, dsk, dsk->u.datalen));
	WT_ILLEGAL_VALUE(session);
	}
	/* NOTREACHED */
}
예제 #24
0
파일: bt_page.c 프로젝트: 3rf/mongo
/*
 * __wt_page_in_func --
 *	Acquire a hazard pointer to a page; if the page is not in-memory,
 *	read it from the disk and build an in-memory version.
 */
int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
	WT_DECL_RET;
	WT_PAGE *page;
	int busy, force_attempts, oldgen;

	for (force_attempts = oldgen = 0;;) {
		switch (ref->state) {
		case WT_REF_DISK:
		case WT_REF_DELETED:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);

			/*
			 * The page isn't in memory, attempt to read it.
			 * Make sure there is space in the cache.
			 */
			WT_RET(__wt_cache_full_check(session));
			WT_RET(__wt_cache_read(session, ref));
			oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
			    F_ISSET(session, WT_SESSION_NO_CACHE);
			continue;
		case WT_REF_READING:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);
			/* FALLTHROUGH */
		case WT_REF_LOCKED:
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);
			/* The page is busy -- wait. */
			break;
		case WT_REF_SPLIT:
			return (WT_RESTART);
		case WT_REF_MEM:
			/*
			 * The page is in memory: get a hazard pointer, update
			 * the page's LRU and return.  The expected reason we
			 * can't get a hazard pointer is because the page is
			 * being evicted; yield and try again.
			 */
#ifdef HAVE_DIAGNOSTIC
			WT_RET(
			    __wt_hazard_set(session, ref, &busy, file, line));
#else
			WT_RET(__wt_hazard_set(session, ref, &busy));
#endif
			if (busy)
				break;

			page = ref->page;
			WT_ASSERT(session, page != NULL);

			/* Forcibly evict pages that are too big. */
			if (!LF_ISSET(WT_READ_NO_EVICT) &&
			    force_attempts < 10 &&
			    __evict_force_check(session, page)) {
				++force_attempts;
				WT_RET(__wt_page_release(session, ref, flags));
				break;
			}

			/* Check if we need an autocommit transaction. */
			if ((ret = __wt_txn_autocommit_check(session)) != 0) {
				WT_TRET(__wt_hazard_clear(session, page));
				return (ret);
			}

			/*
			 * If we read the page and we are configured to not
			 * trash the cache, set the oldest read generation so
			 * the page is forcibly evicted as soon as possible.
			 *
			 * Otherwise, update the page's read generation.
			 */
			if (oldgen && page->read_gen == WT_READGEN_NOTSET)
				__wt_page_evict_soon(page);
			else if (!LF_ISSET(WT_READ_NO_GEN) &&
			    page->read_gen < __wt_cache_read_gen(session))
				page->read_gen =
				    __wt_cache_read_gen_set(session);

			return (0);
		WT_ILLEGAL_VALUE(session);
		}

		/* We failed to get the page -- yield before retrying. */
		__wt_yield();
	}
}
예제 #25
0
/*
 * __wt_verify_dsk --
 *	Verify a single Btree page as read from disk.
 */
int
__wt_verify_dsk(WT_SESSION_IMPL *session, const char *addr, WT_ITEM *buf)
{
	WT_PAGE_HEADER *dsk;
	uint32_t size;
	uint8_t *p, *end;
	u_int i;

	dsk = buf->mem;
	size = buf->size;

	/* Check the page type. */
	switch (dsk->type) {
	case WT_PAGE_BLOCK_MANAGER:
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
	case WT_PAGE_OVFL:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		break;
	case WT_PAGE_INVALID:
	default:
		WT_RET_VRFY(session,
		    "page at %s has an invalid type of %" PRIu32,
		    addr, dsk->type);
	}

	/* Check the page record number. */
	switch (dsk->type) {
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
		if (dsk->recno != 0)
			break;
		WT_RET_VRFY(session,
		    "%s page at %s has a record number of zero",
		    __wt_page_type_string(dsk->type), addr);
	case WT_PAGE_BLOCK_MANAGER:
	case WT_PAGE_OVFL:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		if (dsk->recno == 0)
			break;
		WT_RET_VRFY(session,
		    "%s page at %s has a non-zero record number",
		    __wt_page_type_string(dsk->type), addr);
	}

	/* Check the page flags. */
	switch (dsk->flags) {
	case 0:
	case WT_PAGE_COMPRESSED:
		break;
	default:
		WT_RET_VRFY(session,
		    "page at %s has an invalid flags value of 0x%" PRIx32,
		    addr, (uint32_t)dsk->flags);
	}

	/* Unused bytes */
	for (p = dsk->unused, i = sizeof(dsk->unused); i > 0; --i)
		if (*p != '\0')
			WT_RET_VRFY(session,
			    "page at %s has non-zero unused page header bytes",
			    addr);

	/* Any bytes after the data chunk should be nul bytes. */
	p = (uint8_t *)dsk + dsk->mem_size;
	end = (uint8_t *)dsk + size;
	for (; p < end; ++p)
		if (*p != '\0')
			WT_RET_VRFY(session,
			    "%s page at %s has non-zero trailing bytes",
			    __wt_page_type_string(dsk->type), addr);

	/* Verify the items on the page. */
	switch (dsk->type) {
	case WT_PAGE_COL_INT:
		return (__verify_dsk_col_int(session, addr, dsk));
	case WT_PAGE_COL_FIX:
		return (__verify_dsk_col_fix(session, addr, dsk));
	case WT_PAGE_COL_VAR:
		return (__verify_dsk_col_var(session, addr, dsk));
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		return (__verify_dsk_row(session, addr, dsk));
	case WT_PAGE_BLOCK_MANAGER:
	case WT_PAGE_OVFL:
		return (__verify_dsk_chunk(session, addr, dsk, dsk->u.datalen));
	WT_ILLEGAL_VALUE(session);
	}
	/* NOTREACHED */
}
예제 #26
0
파일: cur_join.c 프로젝트: mongodb/mongo
/*
 * __curjoin_entry_in_range --
 *	Check if a key is in the range specified by the entry, returning
 *	WT_NOTFOUND if not.
 */
static int
__curjoin_entry_in_range(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry,
    WT_ITEM *curkey, WT_CURSOR_JOIN_ITER *iter)
{
	WT_COLLATOR *collator;
	WT_CURSOR_JOIN_ENDPOINT *end, *endmax;
	u_int pos;
	int cmp;
	bool disjunction, passed;

	collator = (entry->index != NULL) ? entry->index->collator : NULL;
	endmax = &entry->ends[entry->ends_next];
	disjunction = F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION);

	/*
	 * The iterator may have already satisfied some endpoint conditions.
	 * If so and we're a disjunction, we're done.  If so and we're a
	 * conjunction, we can start past the satisfied conditions.
	 */
	if (iter == NULL)
		pos = 0;
	else {
		if (disjunction && iter->end_skip)
			return (0);
		pos = iter->end_pos + iter->end_skip;
	}

	for (end = &entry->ends[pos]; end < endmax; end++) {
		WT_RET(__wt_compare(session, collator, curkey, &end->key,
		    &cmp));
		switch (WT_CURJOIN_END_RANGE(end)) {
		case WT_CURJOIN_END_EQ:
			passed = (cmp == 0);
			break;

		case WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ:
			passed = (cmp >= 0);
			WT_ASSERT(session, iter == NULL);
			break;

		case WT_CURJOIN_END_GT:
			passed = (cmp > 0);
			if (passed && iter != NULL && pos == 0)
				iter->end_skip = 1;
			break;

		case WT_CURJOIN_END_LT | WT_CURJOIN_END_EQ:
			passed = (cmp <= 0);
			break;

		case WT_CURJOIN_END_LT:
			passed = (cmp < 0);
			break;

		WT_ILLEGAL_VALUE(session, WT_CURJOIN_END_RANGE(end));
		}

		if (!passed) {
			if (iter != NULL &&
			    (iter->is_equal ||
			    F_ISSET(end, WT_CURJOIN_END_LT))) {
				WT_RET(__curjoin_iter_bump(iter));
				return (WT_NOTFOUND);
			}
			if (!disjunction)
				return (WT_NOTFOUND);
			iter = NULL;
		} else if (disjunction)
			break;
	}
	if (disjunction && end == endmax)
		return (WT_NOTFOUND);
	return (0);
}
예제 #27
0
/*
 * __wt_kv_return --
 *	Return a page referenced key/value pair to the application.
 */
int
__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
{
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_CURSOR *cursor;
	WT_ITEM *tmp;
	WT_PAGE *page;
	WT_ROW *rip;
	uint8_t v;

	btree = S2BT(session);

	page = cbt->ref->page;
	cursor = &cbt->iface;

	switch (page->type) {
	case WT_PAGE_COL_FIX:
		/*
		 * The interface cursor's record has usually been set, but that
		 * isn't universally true, specifically, cursor.search_near may
		 * call here without first setting the interface cursor.
		 */
		cursor->recno = cbt->recno;

		/* If the cursor references a WT_UPDATE item, return it. */
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Take the value from the original page. */
		v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt);
		return (__wt_buf_set(session, &cursor->value, &v, 1));
	case WT_PAGE_COL_VAR:
		/*
		 * The interface cursor's record has usually been set, but that
		 * isn't universally true, specifically, cursor.search_near may
		 * call here without first setting the interface cursor.
		 */
		cursor->recno = cbt->recno;

		/* If the cursor references a WT_UPDATE item, return it. */
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Take the value from the original page cell. */
		cell = WT_COL_PTR(page, &page->pg_var_d[cbt->slot]);
		break;
	case WT_PAGE_ROW_LEAF:
		rip = &page->pg_row_d[cbt->slot];

		/*
		 * If the cursor references a WT_INSERT item, take its key.
		 * Else, if we have an exact match, we copied the key in the
		 * search function, take it from there.
		 * If we don't have an exact match, take the key from the
		 * original page.
		 */
		if (cbt->ins != NULL) {
			cursor->key.data = WT_INSERT_KEY(cbt->ins);
			cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
		} else if (cbt->compare == 0) {
			/*
			 * If not in an insert list and there's an exact match,
			 * the row-store search function built the key we want
			 * to return in the cursor's temporary buffer. Swap the
			 * cursor's search-key and temporary buffers so we can
			 * return it (it's unsafe to return the temporary buffer
			 * itself because our caller might do another search in
			 * this table using the key we return, and we'd corrupt
			 * the search key during any subsequent search that used
			 * the temporary buffer.
			 */
			tmp = cbt->row_key;
			cbt->row_key = cbt->tmp;
			cbt->tmp = tmp;

			cursor->key.data = cbt->row_key->data;
			cursor->key.size = cbt->row_key->size;
		} else
			WT_RET(__wt_row_leaf_key(
			    session, page, rip, &cursor->key, false));

		/* If the cursor references a WT_UPDATE item, return it. */
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Simple values have their location encoded in the WT_ROW. */
		if (__wt_row_leaf_value(page, rip, &cursor->value))
			return (0);

		/*
		 * Take the value from the original page cell (which may be
		 * empty).
		 */
		if ((cell =
		    __wt_row_leaf_value_cell(page, rip, NULL)) == NULL) {
			cursor->value.size = 0;
			return (0);
		}
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* The value is an on-page cell, unpack and expand it as necessary. */
	__wt_cell_unpack(cell, &unpack);
	WT_RET(__wt_page_cell_data_ref(session, page, &unpack, &cursor->value));

	return (0);
}
예제 #28
0
파일: bt_ovfl.c 프로젝트: qihsh/mongo
/*
 * __wt_ovfl_cache --
 *	Handle deletion of an overflow value.
 */
int
__wt_ovfl_cache(WT_SESSION_IMPL *session,
    WT_PAGE *page, void *cookie, WT_CELL_UNPACK *vpack)
{
	int visible;

	/*
	 * This function solves a problem in reconciliation. The scenario is:
	 *     - reconciling a leaf page that references an overflow item
	 *     - the item is updated and the update committed
	 *     - a checkpoint runs, freeing the backing overflow blocks
	 *     - a snapshot transaction wants the original version of the item
	 *
	 * In summary, we may need the original version of an overflow item for
	 * a snapshot transaction after the item was deleted from a page that's
	 * subsequently been checkpointed, where the checkpoint must know about
	 * the freed blocks.  We don't have any way to delay a free of the
	 * underlying blocks until a particular set of transactions exit (and
	 * this shouldn't be a common scenario), so cache the overflow value in
	 * memory.
	 *
	 * This gets hard because the snapshot transaction reader might:
	 *     - search the WT_UPDATE list and not find an useful entry
	 *     - read the overflow value's address from the on-page cell
	 *     - go to sleep
	 *     - checkpoint runs, caches the overflow value, frees the blocks
	 *     - another thread allocates and overwrites the blocks
	 *     - the reader wakes up and reads the wrong value
	 *
	 * Use a read/write lock and the on-page cell to fix the problem: hold
	 * a write lock when changing the cell type from WT_CELL_VALUE_OVFL to
	 * WT_CELL_VALUE_OVFL_RM and hold a read lock when reading an overflow
	 * item.
	 *
	 * The read/write lock is per btree, but it could be per page or even
	 * per overflow item.  We don't do any of that because overflow values
	 * are supposed to be rare and we shouldn't see contention for the lock.
	 *
	 * Check for a globally visible update.  If there is a globally visible
	 * update, we don't need to cache the item because it's not possible for
	 * a running thread to have moved past it.
	 */
	switch (page->type) {
	case WT_PAGE_COL_VAR:
		visible = __ovfl_cache_col_visible(session, cookie, vpack);
		break;
	case WT_PAGE_ROW_LEAF:
		visible = __ovfl_cache_row_visible(session, page, cookie);
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/*
	 * If there's no globally visible update, there's a reader in the system
	 * that might try and read the old value, cache it.
	 */
	if (!visible) {
		WT_RET(__ovfl_cache(session, page, vpack));
		WT_STAT_FAST_DATA_INCR(session, cache_overflow_value);
	}

	/*
	 * Queue the on-page cell to be set to WT_CELL_VALUE_OVFL_RM and the
	 * underlying overflow value's blocks to be freed when reconciliation
	 * completes.
	 */
	return (__wt_ovfl_discard_add(session, page, vpack->cell));
}
예제 #29
0
int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
{
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_CURSOR *cursor;
	WT_PAGE *page;
	WT_ROW *rip;
	uint8_t v;

	switch (page->type){
	case WT_PAGE_COL_FIX:
		cursor->recno = cbt->recno;
		/*cursor对应的是一个upd,直接返回value*/
		if (upd != NULL){
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return 0;
		}

		v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt);
		return __wt_buf_set(session, &cursor->value, &v, 1);
		
	case WT_PAGE_COL_VAR:
		cursor->recno = cbt->recno;

		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}
		/*获得对应的cell,并通过cell得到K/V值*/
		cell = WT_COL_PTR(page, &page->pg_var_d[cbt->slot]);
		break;

	case WT_PAGE_ROW_LEAF:
		rip = &page->pg_row_d[cbt->slot];

		if (cbt->ins != NULL){ /*插入的k/v对*/
			cursor->key.data = WT_INSERT_KEY(cbt->ins);
			cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
		}
		else if (cbt->compare == 0){/*比较器定位到了对应的k/v对*/
			cursor->key.data = cbt->search_key.data;
			cursor->key.size = cbt->search_key.size;
		}
		else
			WT_RET(__wt_row_leaf_key(session, page, rip, &cursor->key, 0)); /*设置key的值*/

		/*值是在append/update list当中,从当中取*/
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}
		/*可以直接通过rip指针获得value,K/V是存储在cell空间之内*/
		if (__wt_row_leaf_value(page, rip, &cursor->value))
			return 0;

		/*不是连续存储的,需要通过解析cell来定位到value*/
		if (cell = __wt_row_leaf_value_cell(page, rip, NULL) == NULL){
			cursor->value.size = 0;
			return 0;
		}
		break;

		WT_ILLEGAL_VALUE(session);
	}
	/*通过cell解析到对应的value值, ovfl item*/
	__wt_cell_unpack(cell, &unpack);
	WT_RET(__wt_page_cell_data_ref(session, page, &unpack, &cursor->value));

	return 0;
}
예제 #30
0
파일: bt_read.c 프로젝트: qihsh/mongo
/*
 * __wt_page_in_func --
 *	Acquire a hazard pointer to a page; if the page is not in-memory,
 *	read it from the disk and build an in-memory version.
 */
int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE *page;
	u_int sleep_cnt, wait_cnt;
	int busy, cache_work, force_attempts, oldgen, stalled;

	btree = S2BT(session);
	stalled = 0;

	for (force_attempts = oldgen = 0, sleep_cnt = wait_cnt = 0;;) {
		switch (ref->state) {
		case WT_REF_DISK:
		case WT_REF_DELETED:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);

			/*
			 * The page isn't in memory, read it. If this thread is
			 * allowed to do eviction work, check for space in the
			 * cache.
			 */
			if (!LF_ISSET(WT_READ_NO_EVICT))
				WT_RET(__wt_cache_eviction_check(
				    session, 1, NULL));
			WT_RET(__page_read(session, ref));
			oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
			    F_ISSET(session, WT_SESSION_NO_CACHE);
			continue;
		case WT_REF_READING:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);

			/* Waiting on another thread's read, stall. */
			WT_STAT_FAST_CONN_INCR(session, page_read_blocked);
			stalled = 1;
			break;
		case WT_REF_LOCKED:
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);

			/* Waiting on eviction, stall. */
			WT_STAT_FAST_CONN_INCR(session, page_locked_blocked);
			stalled = 1;
			break;
		case WT_REF_SPLIT:
			return (WT_RESTART);
		case WT_REF_MEM:
			/*
			 * The page is in memory.
			 *
			 * Get a hazard pointer if one is required. We cannot
			 * be evicting if no hazard pointer is required, we're
			 * done.
			 */
			if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
				goto skip_evict;

			/*
			 * The expected reason we can't get a hazard pointer is
			 * because the page is being evicted, yield, try again.
			 */
#ifdef HAVE_DIAGNOSTIC
			WT_RET(
			    __wt_hazard_set(session, ref, &busy, file, line));
#else
			WT_RET(__wt_hazard_set(session, ref, &busy));
#endif
			if (busy) {
				WT_STAT_FAST_CONN_INCR(
				    session, page_busy_blocked);
				break;
			}

			/*
			 * If eviction is configured for this file, check to see
			 * if the page qualifies for forced eviction and update
			 * the page's generation number. If eviction isn't being
			 * done on this file, we're done.
			 */
			if (LF_ISSET(WT_READ_NO_EVICT) ||
			    F_ISSET(session, WT_SESSION_NO_EVICTION) ||
			    F_ISSET(btree, WT_BTREE_NO_EVICTION))
				goto skip_evict;

			/*
			 * Forcibly evict pages that are too big.
			 */
			page = ref->page;
			if (force_attempts < 10 &&
			    __evict_force_check(session, page)) {
				++force_attempts;
				ret = __wt_page_release_evict(session, ref);
				/* If forced eviction fails, stall. */
				if (ret == EBUSY) {
					ret = 0;
					WT_STAT_FAST_CONN_INCR(session,
					    page_forcible_evict_blocked);
					stalled = 1;
					break;
				}
				WT_RET(ret);

				/*
				 * The result of a successful forced eviction
				 * is a page-state transition (potentially to
				 * an in-memory page we can use, or a restart
				 * return for our caller), continue the outer
				 * page-acquisition loop.
				 */
				continue;
			}

			/*
			 * If we read the page and we are configured to not
			 * trash the cache, set the oldest read generation so
			 * the page is forcibly evicted as soon as possible.
			 *
			 * Otherwise, update the page's read generation.
			 */
			if (oldgen && page->read_gen == WT_READGEN_NOTSET)
				__wt_page_evict_soon(page);
			else if (!LF_ISSET(WT_READ_NO_GEN) &&
			    page->read_gen != WT_READGEN_OLDEST &&
			    page->read_gen < __wt_cache_read_gen(session))
				page->read_gen =
				    __wt_cache_read_gen_bump(session);
skip_evict:
			/*
			 * Check if we need an autocommit transaction.
			 * Starting a transaction can trigger eviction, so skip
			 * it if eviction isn't permitted.
			 */
			return (LF_ISSET(WT_READ_NO_EVICT) ? 0 :
			    __wt_txn_autocommit_check(session));
		WT_ILLEGAL_VALUE(session);
		}

		/*
		 * We failed to get the page -- yield before retrying, and if
		 * we've yielded enough times, start sleeping so we don't burn
		 * CPU to no purpose.
		 */
		if (stalled)
			wait_cnt += 1000;
		else if (++wait_cnt < 1000) {
			__wt_yield();
			continue;
		}

		/*
		 * If stalling and this thread is allowed to do eviction work,
		 * check if the cache needs help. If we do work for the cache,
		 * substitute that for a sleep.
		 */
		if (!LF_ISSET(WT_READ_NO_EVICT)) {
			WT_RET(
			    __wt_cache_eviction_check(session, 1, &cache_work));
			if (cache_work)
				continue;
		}
		sleep_cnt = WT_MIN(sleep_cnt + 1000, 10000);
		WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt);
		__wt_sleep(0, sleep_cnt);
	}
}