Beispiel #1
0
/*
 * __cursor_var_append_next --
 *	Return the next variable-length entry on the append list.
 */
static inline int
__cursor_var_append_next(WT_CURSOR_BTREE *cbt, bool newpage)
{
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;

	if (newpage) {
		cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
		goto new_page;
	}

	for (;;) {
		cbt->ins = WT_SKIP_NEXT(cbt->ins);
new_page:	if (cbt->ins == NULL)
			return (WT_NOTFOUND);

		__cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
		if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL)
			continue;
		if (upd->type == WT_UPDATE_DELETED) {
			if (__wt_txn_upd_visible_all(session, upd))
				++cbt->page_deleted_count;
			continue;
		}
		return (__wt_value_return(session, cbt, upd));
	}
	/* NOTREACHED */
}
Beispiel #2
0
/*
 * __cursor_var_append_next --
 *	Return the next variable-length entry on the append list.
 */
static inline int
__cursor_var_append_next(WT_CURSOR_BTREE *cbt, int newpage)
{
	WT_ITEM *val;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	val = &cbt->iface.value;

	if (newpage) {
		cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
		goto new_page;
	}

	for (;;) {
		cbt->ins = WT_SKIP_NEXT(cbt->ins);
new_page:	if (cbt->ins == NULL)
			return (WT_NOTFOUND);

		__cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
		if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL ||
		    WT_UPDATE_DELETED_ISSET(upd))
			continue;
		val->data = WT_UPDATE_DATA(upd);
		val->size = upd->size;
		break;
	}
	return (0);
}
Beispiel #3
0
/*btree cursor移向下一个记录,仅仅在append list上移动*/
static inline int __cursor_fix_append_next(WT_CURSOR_BTREE* cbt, int newpage)
{
	WT_ITEM *val;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	val = &cbt->iface.value;

	/*新载入的page,判断ins_head是否为空,如果为空表示没有append的记录*/
	if (newpage){
		if ((cbt->ins = WT_SKIP_FIRST(cbt->ins_head)) == NULL)
			return (WT_NOTFOUND);
	}
	else{
		/*已经到append list的最后一条记录了,后面没有记录*/
		if (cbt->recno >= WT_INSERT_RECNO(cbt->ins) && (cbt->ins = WT_SKIP_NEXT(cbt->ins)) == NULL)
			return (WT_NOTFOUND);
	}
	/*
	* This code looks different from the cursor-previous code.  The append
	* list appears on the last page of the tree, but it may be preceded by
	* other rows, which means the cursor's recno will be set to a value and
	* we simply want to increment it.  If the cursor's recno is NOT set,
	* we're starting our iteration in a tree that has only appended items.
	* In that case, recno will be 0 and happily enough the increment will
	* set it to 1, which is correct.
	*/
	__cursor_set_recno(cbt, cbt->recno + 1);

	/*
	* Fixed-width column store appends are inherently non-transactional.
	* Even a non-visible update by a concurrent or aborted transaction
	* changes the effective end of the data.  The effect is subtle because
	* of the blurring between deleted and empty values, but ideally we
	* would skip all uncommitted changes at the end of the data.  This
	* doesn't apply to variable-width column stores because the implicitly
	* created records written by reconciliation are deleted and so can be
	* never seen by a read.
	*
	* The problem is that we don't know at this point whether there may be
	* multiple uncommitted changes at the end of the data, and it would be
	* expensive to check every time we hit an aborted update.  If an
	* insert is aborted, we simply return zero (empty), regardless of
	* whether we are at the end of the data.
	*/
	if (cbt->recno < WT_INSERT_RECNO(cbt->ins) || (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL){ /*没有可见的记录值,直接返回0*/
		cbt->v = 0;
		val->data = &cbt->v;
	}
	else
		val->data = WT_UPDATE_DATA(upd);
	val->size = 1;
	return 0;
}
Beispiel #4
0
/*
 * __free_page_col_fix --
 *	Discard a WT_PAGE_COL_FIX page.
 */
static void
__free_page_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_INSERT_HEAD *append;

	/* Free the append array. */
	if ((append = WT_COL_APPEND(page)) != NULL) {
		__free_skip_list(session, WT_SKIP_FIRST(append));
		__wt_free(session, append);
		__wt_free(session, page->modify->append);
	}

	/* Free the update array. */
	if (page->modify != NULL && page->modify->update != NULL)
		__free_skip_array(session, page->modify->update, 1);
}
Beispiel #5
0
/*
 * __free_page_modify --
 *	Discard the page's associated modification structures.
 */
static void
__free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_INSERT_HEAD *append;
	WT_PAGE_MODIFY *mod;

	mod = page->modify;

	switch (F_ISSET(mod, WT_PM_REC_MASK)) {
	case WT_PM_REC_SPLIT:
		/*
		 * If the page split, there may one or more pages linked from
		 * the page; walk the list, discarding pages.
		 */
		__wt_page_out(session, &mod->u.split);
		break;
	case WT_PM_REC_REPLACE:
		/*
		 * Discard any replacement address: this memory is usually moved
		 * into the parent's WT_REF, but at the root that can't happen.
		 */
		__wt_free(session, mod->u.replace.addr);
		break;
	default:
		break;
	}

	/* Free the append array. */
	if ((append = WT_COL_APPEND(page)) != NULL) {
		__free_skip_list(session, WT_SKIP_FIRST(append));
		__wt_free(session, append);
		__wt_free(session, mod->append);
	}

	/* Free the insert/update array. */
	if (mod->update != NULL)
		__free_skip_array(session, mod->update,
		    page->type == WT_PAGE_COL_FIX ? 1 : page->entries);

	/* Discard any objects the page was tracking plus associated memory. */
	__wt_rec_track_discard(session, page);
	__wt_free(session, mod->track);
	__wt_free(session, page->modify);
}
Beispiel #6
0
/*
 * __free_skip_array --
 *	Discard an array of skip list headers.
 */
static void
__free_skip_array(
    WT_SESSION_IMPL *session, WT_INSERT_HEAD **head_arg, uint32_t entries)
{
	WT_INSERT_HEAD **head;

	/*
	 * For each non-NULL slot in the page's array of inserts, free the
	 * linked list anchored in that slot.
	 */
	for (head = head_arg; entries > 0; --entries, ++head)
		if (*head != NULL) {
			__free_skip_list(session, WT_SKIP_FIRST(*head));
			__wt_free(session, *head);
		}

	/* Free the page's array of inserts. */
	__wt_free(session, head_arg);
}
Beispiel #7
0
/*
 * __free_page_col_var --
 *	Discard a WT_PAGE_COL_VAR page.
 */
static void
__free_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_INSERT_HEAD *append;

	/* Free the in-memory index array. */
	__wt_free(session, page->u.col_var.d);

	/* Free the RLE lookup array. */
	__wt_free(session, page->u.col_var.repeats);

	/* Free the append array. */
	if ((append = WT_COL_APPEND(page)) != NULL) {
		__free_skip_list(session, WT_SKIP_FIRST(append));
		__wt_free(session, append);
		__wt_free(session, page->modify->append);
	}

	/* Free the insert array. */
	if (page->modify != NULL && page->modify->update != NULL)
		__free_skip_array(session, page->modify->update, page->entries);
}
Beispiel #8
0
/*在append list上移动variable-length类型的btree cursor*/
static inline int __cursor_var_append_next(WT_CURSOR_BTREE* cbt, int newpage)
{
	WT_ITEM *val;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	val = &cbt->iface.value;

	if (newpage){
		cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
		goto new_page;
	}

	for (;;){
		cbt->ins = WT_SKIP_NEXT(cbt->ins);
new_page:
		if (cbt->ins == NULL)
			return (WT_NOTFOUND);

		__cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
		/*事务隔离读,对本事务不可见,继续向前*/
		if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL)
			continue;
		/*删除集合,不做指向这条记录,继续向下移动*/
		if (WT_UPDATE_DELETED_ISSET(upd)) {
			++cbt->page_deleted_count;
			continue;
		}
		/*赋值value*/
		val->data = WT_UPDATE_DATA(upd);
		val->size = upd->size;

		return 0;
	}
}
Beispiel #9
0
/*
 * __cursor_row_next --
 *	Move to the next row-store item.
 */
static inline int
__cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage)
{
	WT_INSERT *ins;
	WT_ITEM *key;
	WT_PAGE *page;
	WT_ROW *rip;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	page = cbt->ref->page;
	key = &cbt->iface.key;

	/*
	 * For row-store pages, we need a single item that tells us the part
	 * of the page we're walking (otherwise switching from next to prev
	 * and vice-versa is just too complicated), so we map the WT_ROW and
	 * WT_INSERT_HEAD insert array slots into a single name space: slot 1
	 * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
	 * WT_INSERT_HEAD[0], and so on.  This means WT_INSERT lists are
	 * odd-numbered slots, and WT_ROW array slots are even-numbered slots.
	 *
	 * Initialize for each new page.
	 */
	if (newpage) {
		cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
		cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
		cbt->row_iteration_slot = 1;
		cbt->rip_saved = NULL;
		goto new_insert;
	}

	/* Move to the next entry and return the item. */
	for (;;) {
		/*
		 * Continue traversing any insert list; maintain the insert list
		 * head reference and entry count in case we switch to a cursor
		 * previous movement.
		 */
		if (cbt->ins != NULL)
			cbt->ins = WT_SKIP_NEXT(cbt->ins);

new_insert:	if ((ins = cbt->ins) != NULL) {
			if ((upd = __wt_txn_read(session, ins->upd)) == NULL)
				continue;
			if (upd->type == WT_UPDATE_DELETED) {
				if (__wt_txn_upd_visible_all(session, upd))
					++cbt->page_deleted_count;
				continue;
			}
			key->data = WT_INSERT_KEY(ins);
			key->size = WT_INSERT_KEY_SIZE(ins);
			return (__wt_value_return(session, cbt, upd));
		}

		/* Check for the end of the page. */
		if (cbt->row_iteration_slot >= page->entries * 2 + 1)
			return (WT_NOTFOUND);
		++cbt->row_iteration_slot;

		/*
		 * Odd-numbered slots configure as WT_INSERT_HEAD entries,
		 * even-numbered slots configure as WT_ROW entries.
		 */
		if (cbt->row_iteration_slot & 0x01) {
			cbt->ins_head = WT_ROW_INSERT_SLOT(
			    page, cbt->row_iteration_slot / 2 - 1);
			cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
			goto new_insert;
		}
		cbt->ins_head = NULL;
		cbt->ins = NULL;

		cbt->slot = cbt->row_iteration_slot / 2 - 1;
		rip = &page->pg_row[cbt->slot];
		upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip));
		if (upd != NULL && upd->type == WT_UPDATE_DELETED) {
			if (__wt_txn_upd_visible_all(session, upd))
				++cbt->page_deleted_count;
			continue;
		}
		return (__cursor_row_slot_return(cbt, rip, upd));
	}
	/* NOTREACHED */
}
Beispiel #10
0
/*移向行存储的下一个行对象*/
static inline int __cursor_row_next(WT_CURSOR_BTREE* cbt, int newpage)
{
	WT_INSERT *ins;
	WT_ITEM *key, *val;
	WT_PAGE *page;
	WT_ROW *rip;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	page = cbt->ref->page;
	key = &cbt->iface.key;
	val = &cbt->iface.value;

	/*假如是newpage,定位到insert修改队列的头位置*/
	if (newpage){
		cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
		cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
		cbt->row_iteration_slot = 1;
		goto new_insert;
	}

	for (;;){
		if (cbt->ins != NULL)
			cbt->ins = WT_SKIP_NEXT(cbt->ins);

new_insert:
		if ((ins = cbt->ins) != NULL) {
			/*事务可见数据读取*/
			if ((upd = __wt_txn_read(session, ins->upd)) == NULL)
				continue;
			/*判断是否删除,如果删除,跳过被删除的对象*/
			if (WT_UPDATE_DELETED_ISSET(upd)) {
				++cbt->page_deleted_count;
				continue;
			}

			key->data = WT_INSERT_KEY(ins);
			key->size = WT_INSERT_KEY_SIZE(ins);
			val->data = WT_UPDATE_DATA(upd);
			val->size = upd->size;
			return 0;
		}

		/*检索page row entires数组, 到了page的末尾*/
		if (cbt->row_iteration_slot >= page->pg_row_entries * 2 + 1)
			return (WT_NOTFOUND);
		++cbt->row_iteration_slot;

		/*
		* Odd-numbered slots configure as WT_INSERT_HEAD entries,
		* even-numbered slots configure as WT_ROW entries.
		*/
		if (cbt->row_iteration_slot & 0x01) {
			cbt->ins_head = WT_ROW_INSERT_SLOT(page, cbt->row_iteration_slot / 2 - 1);
			cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
			goto new_insert;
		}

		cbt->ins_head = NULL;
		cbt->ins = NULL;

		/*计算定位slot*/
		cbt->slot = cbt->row_iteration_slot / 2 - 1;
		rip = &page->pg_row_d[cbt->slot];

		upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip));
		if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) {
			++cbt->page_deleted_count;
			continue;
		}

		return __cursor_row_slot_return(cbt, rip, upd);
	}
}
Beispiel #11
0
/*
 * __free_page_modify --
 *	Discard the page's associated modification structures.
 */
static void
__free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_INSERT_HEAD *append;
	WT_MULTI *multi;
	WT_PAGE_MODIFY *mod;
	uint32_t i;
	bool update_ignore;

	mod = page->modify;

	/* In some failed-split cases, we can't discard updates. */
	update_ignore = F_ISSET_ATOMIC(page, WT_PAGE_UPDATE_IGNORE);

	switch (mod->rec_result) {
	case WT_PM_REC_MULTIBLOCK:
		/* Free list of replacement blocks. */
		for (multi = mod->mod_multi,
		    i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
			switch (page->type) {
			case WT_PAGE_ROW_INT:
			case WT_PAGE_ROW_LEAF:
				__wt_free(session, multi->key.ikey);
				break;
			}
			__wt_free(session, multi->supd);
			__wt_free(session, multi->disk_image);
			__wt_free(session, multi->addr.addr);
		}
		__wt_free(session, mod->mod_multi);
		break;
	case WT_PM_REC_REPLACE:
		/*
		 * Discard any replacement address: this memory is usually moved
		 * into the parent's WT_REF, but at the root that can't happen.
		 */
		__wt_free(session, mod->mod_replace.addr);
		break;
	}

	switch (page->type) {
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_VAR:
		/* Free the append array. */
		if ((append = WT_COL_APPEND(page)) != NULL) {
			__free_skip_list(
			    session, WT_SKIP_FIRST(append), update_ignore);
			__wt_free(session, append);
			__wt_free(session, mod->mod_append);
		}

		/* Free the insert/update array. */
		if (mod->mod_update != NULL)
			__free_skip_array(session, mod->mod_update,
			    page->type ==
			    WT_PAGE_COL_FIX ? 1 : page->pg_var_entries,
			    update_ignore);
		break;
	}

	/* Free the overflow on-page, reuse and transaction-cache skiplists. */
	__wt_ovfl_reuse_free(session, page);
	__wt_ovfl_txnc_free(session, page);
	__wt_ovfl_discard_free(session, page);

	__wt_free(session, page->modify->ovfl_track);

	__wt_free(session, page->modify);
}
Beispiel #12
0
/*
 * __wt_row_modify --
 *	Row-store insert, update and delete.
 */
int
__wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
{
	WT_DECL_RET;
	WT_INSERT *ins;
	WT_INSERT_HEAD *ins_head, **ins_headp;
	WT_ITEM *key, *value;
	WT_PAGE *page;
	WT_UPDATE *old_upd, *upd, **upd_entry;
	size_t ins_size, upd_size;
	uint32_t ins_slot;
	u_int i, skipdepth;
	int logged;

	key = &cbt->iface.key;
	value = is_remove ? NULL : &cbt->iface.value;

	page = cbt->page;

	/* If we don't yet have a modify structure, we'll need one. */
	WT_RET(__wt_page_modify_init(session, page));

	ins = NULL;
	upd = NULL;
	logged = 0;

	/*
	 * Modify: allocate an update array as necessary, build a WT_UPDATE
	 * structure, and call a serialized function to insert the WT_UPDATE
	 * structure.
	 *
	 * Insert: allocate an insert array as necessary, build a WT_INSERT
	 * and WT_UPDATE structure pair, and call a serialized function to
	 * insert the WT_INSERT structure.
	 */
	if (cbt->compare == 0) {
		if (cbt->ins == NULL) {
			/* Allocate an update array as necessary. */
			WT_PAGE_ALLOC_AND_SWAP(session, page,
			    page->u.row.upd, upd_entry, page->entries);

			/* Set the WT_UPDATE array reference. */
			upd_entry = &page->u.row.upd[cbt->slot];
		} else
			upd_entry = &cbt->ins->upd;

		/* Make sure the update can proceed. */
		WT_ERR(__wt_txn_update_check(session, old_upd = *upd_entry));

		/* Allocate the WT_UPDATE structure and transaction ID. */
		WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
		WT_ERR(__wt_txn_modify(session, cbt, upd));
		logged = 1;

		/*
		 * Point the new WT_UPDATE item to the next element in the list.
		 * If we get it right, the serialization function lock acts as
		 * our memory barrier to flush this write.
		 */
		upd->next = old_upd;

		/* Serialize the update. */
		WT_ERR(__wt_update_serial(
		    session, page, upd_entry, &upd, upd_size));
	} else {
		/*
		 * Allocate the insert array as necessary.
		 *
		 * We allocate an additional insert array slot for insert keys
		 * sorting less than any key on the page.  The test to select
		 * that slot is baroque: if the search returned the first page
		 * slot, we didn't end up processing an insert list, and the
		 * comparison value indicates the search key was smaller than
		 * the returned slot, then we're using the smallest-key insert
		 * slot.  That's hard, so we set a flag.
		 */
		WT_PAGE_ALLOC_AND_SWAP(session, page,
		    page->u.row.ins, ins_headp, page->entries + 1);

		ins_slot = F_ISSET(cbt, WT_CBT_SEARCH_SMALLEST) ?
		    page->entries : cbt->slot;
		ins_headp = &page->u.row.ins[ins_slot];

		/* Allocate the WT_INSERT_HEAD structure as necessary. */
		WT_PAGE_ALLOC_AND_SWAP(session, page, *ins_headp, ins_head, 1);
		ins_head = *ins_headp;

		/* Choose a skiplist depth for this insert. */
		skipdepth = __wt_skip_choose_depth();

		/*
		 * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and
		 * update the cursor to reference it.
		 */
		WT_ERR(__wt_row_insert_alloc(
		    session, key, skipdepth, &ins, &ins_size));
		WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
		ins->upd = upd;
		ins_size += upd_size;

		/*
		 * Update the cursor: the WT_INSERT_HEAD might be allocated,
		 * the WT_INSERT was allocated.
		 */
		cbt->ins_head = ins_head;
		cbt->ins = ins;
		WT_ERR(__wt_txn_modify(session, cbt, upd));
		logged = 1;

		/*
		 * If there was no insert list during the search, the cursor's
		 * information cannot be correct, search couldn't have
		 * initialized it.
		 *
		 * Otherwise, point the new WT_INSERT item's skiplist to the
		 * next elements in the insert list (which we will check are
		 * still valid inside the serialization function).
		 *
		 * The serial mutex acts as our memory barrier to flush these
		 * writes before inserting them into the list.
		 */
		if (WT_SKIP_FIRST(ins_head) == NULL)
			for (i = 0; i < skipdepth; i++) {
				cbt->ins_stack[i] = &ins_head->head[i];
				ins->next[i] = cbt->next_stack[i] = NULL;
			}
		else
			for (i = 0; i < skipdepth; i++)
				ins->next[i] = cbt->next_stack[i];

		/* Insert the WT_INSERT structure. */
		WT_ERR(__wt_insert_serial(
		    session, page, cbt->ins_head, cbt->ins_stack,
		    &ins, ins_size, skipdepth));
	}

	if (0) {
err:		/*
		 * Remove the update from the current transaction, so we don't
		 * try to modify it on rollback.
		 */
		if (logged)
			__wt_txn_unmodify(session);
		__wt_free(session, ins);
		cbt->ins = NULL;
		__wt_free(session, upd);
	}

	return (ret);
}
Beispiel #13
0
/*
 * __wt_col_modify --
 *	Column-store delete, insert, and update.
 */
int
__wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
    uint64_t recno, WT_ITEM *value, WT_UPDATE *upd, int is_remove)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_INSERT *ins;
	WT_INSERT_HEAD *ins_head, **ins_headp;
	WT_ITEM _value;
	WT_PAGE *page;
	WT_UPDATE *old_upd;
	size_t ins_size, upd_size;
	u_int i, skipdepth;
	int append, logged;

	btree = cbt->btree;
	ins = NULL;
	page = cbt->ref->page;
	append = logged = 0;

	/* This code expects a remove to have a NULL value. */
	if (is_remove) {
		if (btree->type == BTREE_COL_FIX) {
			value = &_value;
			value->data = "";
			value->size = 1;
		} else
			value = NULL;
	} else {
		/*
		 * There's some chance the application specified a record past
		 * the last record on the page.  If that's the case, and we're
		 * inserting a new WT_INSERT/WT_UPDATE pair, it goes on the
		 * append list, not the update list. In addition, a recno of 0
		 * implies an append operation, we're allocating a new row.
		 */
		if (recno == 0 ||
		    recno > (btree->type == BTREE_COL_VAR ?
		    __col_var_last_recno(page) : __col_fix_last_recno(page)))
			append = 1;
	}

	/* If we don't yet have a modify structure, we'll need one. */
	WT_RET(__wt_page_modify_init(session, page));

	/*
	 * Delete, insert or update a column-store entry.
	 *
	 * If modifying a previously modified record, create a new WT_UPDATE
	 * entry and have a serialized function link it into an existing
	 * WT_INSERT entry's WT_UPDATE list.
	 *
	 * Else, allocate an insert array as necessary, build a WT_INSERT and
	 * WT_UPDATE structure pair, and call a serialized function to insert
	 * the WT_INSERT structure.
	 */
	if (cbt->compare == 0 && cbt->ins != NULL) {
		/*
		 * If we are restoring updates that couldn't be evicted, the
		 * key must not exist on the new page.
		 */
		WT_ASSERT(session, upd == NULL);

		/* Make sure the update can proceed. */
		WT_ERR(__wt_txn_update_check(
		    session, old_upd = cbt->ins->upd));

		/* Allocate a WT_UPDATE structure and transaction ID. */
		WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
		WT_ERR(__wt_txn_modify(session, upd));
		logged = 1;

		/* Avoid a data copy in WT_CURSOR.update. */
		cbt->modify_update = upd;

		/*
		 * Point the new WT_UPDATE item to the next element in the list.
		 * If we get it right, the serialization function lock acts as
		 * our memory barrier to flush this write.
		 */
		upd->next = old_upd;

		/* Serialize the update. */
		WT_ERR(__wt_update_serial(
		    session, page, &cbt->ins->upd, &upd, upd_size));
	} else {
		/* Allocate the append/update list reference as necessary. */
		if (append) {
			WT_PAGE_ALLOC_AND_SWAP(session,
			    page, page->modify->mod_append, ins_headp, 1);
			ins_headp = &page->modify->mod_append[0];
		} else if (page->type == WT_PAGE_COL_FIX) {
			WT_PAGE_ALLOC_AND_SWAP(session,
			    page, page->modify->mod_update, ins_headp, 1);
			ins_headp = &page->modify->mod_update[0];
		} else {
			WT_PAGE_ALLOC_AND_SWAP(session,
			    page, page->modify->mod_update, ins_headp,
			    page->pg_var_entries);
			ins_headp = &page->modify->mod_update[cbt->slot];
		}

		/* Allocate the WT_INSERT_HEAD structure as necessary. */
		WT_PAGE_ALLOC_AND_SWAP(session, page, *ins_headp, ins_head, 1);
		ins_head = *ins_headp;

		/* Choose a skiplist depth for this insert. */
		skipdepth = __wt_skip_choose_depth(session);

		/*
		 * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and
		 * update the cursor to reference it (the WT_INSERT_HEAD might
		 * be allocated, the WT_INSERT was allocated).
		 */
		WT_ERR(__col_insert_alloc(
		    session, recno, skipdepth, &ins, &ins_size));
		cbt->ins_head = ins_head;
		cbt->ins = ins;

		if (upd == NULL) {
			WT_ERR(
			    __wt_update_alloc(session, value, &upd, &upd_size));
			WT_ERR(__wt_txn_modify(session, upd));
			logged = 1;

			/* Avoid a data copy in WT_CURSOR.update. */
			cbt->modify_update = upd;
		} else
			upd_size = __wt_update_list_memsize(upd);
		ins->upd = upd;
		ins_size += upd_size;

		/*
		 * If there was no insert list during the search, or there was
		 * no search because the record number has not been allocated
		 * yet, the cursor's information cannot be correct, search
		 * couldn't have initialized it.
		 *
		 * Otherwise, point the new WT_INSERT item's skiplist to the
		 * next elements in the insert list (which we will check are
		 * still valid inside the serialization function).
		 *
		 * The serial mutex acts as our memory barrier to flush these
		 * writes before inserting them into the list.
		 */
		if (WT_SKIP_FIRST(ins_head) == NULL || recno == 0)
			for (i = 0; i < skipdepth; i++) {
				cbt->ins_stack[i] = &ins_head->head[i];
				ins->next[i] = cbt->next_stack[i] = NULL;
			}
		else
			for (i = 0; i < skipdepth; i++)
				ins->next[i] = cbt->next_stack[i];

		/* Append or insert the WT_INSERT structure. */
		if (append)
			WT_ERR(__wt_col_append_serial(
			    session, page, cbt->ins_head, cbt->ins_stack,
			    &ins, ins_size, &cbt->recno, skipdepth));
		else
			WT_ERR(__wt_insert_serial(
			    session, page, cbt->ins_head, cbt->ins_stack,
			    &ins, ins_size, skipdepth));
	}

	/* If the update was successful, add it to the in-memory log. */
	if (logged)
		WT_ERR(__wt_txn_log_op(session, cbt));

	if (0) {
err:		/*
		 * Remove the update from the current transaction, so we don't
		 * try to modify it on rollback.
		 */
		if (logged)
			__wt_txn_unmodify(session);
		__wt_free(session, ins);
		__wt_free(session, upd);
	}

	return (ret);
}