Пример #1
0
/*
 * __wt_compact --
 *	Compact a file.
 *
 *	Reads the "trigger" value from the configuration and asks the block
 *	manager whether compaction can be skipped.  If not, in-memory pages
 *	are reviewed (under the connection's metadata lock) and then the tree
 *	is walked, marking every page the walk returns, and the tree, dirty.
 *
 *	Returns 0 on success, including when the block manager says to skip
 *	compaction; otherwise the error from the first failing call.
 */
int
__wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_BM *bm;
	WT_CONFIG_ITEM cval;
	WT_DECL_RET;
	WT_PAGE *page;
	int trigger, skip;

	bm = S2BT(session)->bm;

	WT_DSTAT_INCR(session, session_compact);

	WT_RET(__wt_config_gets(session, cfg, "trigger", &cval));
	trigger = (int)cval.val;

	/* Check if compaction might be useful. */
	WT_RET(bm->compact_skip(bm, session, trigger, &skip));
	if (skip)
		return (0);

	/*
	 * Walk the cache reviewing in-memory pages to see if they need to be
	 * re-written.  This requires looking at page reconciliation results,
	 * which means the page cannot be reconciled at the same time as it's
	 * being reviewed for compaction.  The underlying functions ensure we
	 * don't collide with page eviction, but we need to make sure we don't
	 * collide with checkpoints either, they are the other operation that
	 * can reconcile a page.
	 *
	 * Save the result and release the lock before checking for failure:
	 * returning directly from the cache operation would leave the
	 * metadata lock held.
	 */
	__wt_spin_lock(session, &S2C(session)->metadata_lock);
	ret = __wt_bt_cache_op(session, NULL, WT_SYNC_COMPACT);
	__wt_spin_unlock(session, &S2C(session)->metadata_lock);
	WT_RET(ret);

	/*
	 * Walk the tree, reviewing on-disk pages to see if they need to be
	 * re-written.
	 */
	for (page = NULL;;) {
		WT_RET(__wt_tree_walk(session, &page, WT_TREE_COMPACT));

		/* A NULL page ends the walk. */
		if (page == NULL)
			break;

		/*
		 * The only pages returned by the tree walk function are pages
		 * we want to re-write; mark the page and tree dirty.
		 *
		 * If the modify structure can't be set up, release the page
		 * before returning; the original error is the one returned.
		 */
		if ((ret = __wt_page_modify_init(session, page)) != 0) {
			WT_TRET(__wt_page_release(session, page));
			WT_RET(ret);
		}
		__wt_page_and_tree_modify_set(session, page);

		WT_DSTAT_INCR(session, btree_compact_rewrite);
	}

	return (0);
}
Пример #2
0
/*
 * __wt_update_serial_func --
 *	Server function to add a WT_UPDATE entry in the page array.
 *
 *	Unpacks the caller's arguments, checks the page's write generation,
 *	links the new update at the head of the update list, installs the
 *	caller-supplied update array if the page doesn't have one, reports
 *	obsolete updates back to the caller and marks the page and tree
 *	modified.
 *
 *	Returns 0 on success.  A failing write-generation check is handed to
 *	WT_RET before anything is changed (presumably returning that error to
 *	the caller -- confirm against the macro's definition).
 */
int
__wt_update_serial_func(WT_SESSION_IMPL *session, void *args)
{
	WT_PAGE *page;
	WT_UPDATE **new_upd, *upd, **upd_entry, **upd_obsolete;
	uint32_t write_gen;

	/* Pull the individual arguments out of the opaque argument block. */
	__wt_update_unpack(
	    args, &page, &write_gen, &upd_entry, &new_upd, &upd, &upd_obsolete);

	/* Check the page's write-generation. */
	WT_RET(__wt_page_write_gen_check(session, page, write_gen));

	/* Chain the current head of the list behind the new update. */
	upd->next = *upd_entry;
	/*
	 * Publish: there must be a barrier to ensure the new entry's next
	 * pointer is set before we update the linked list.
	 */
	WT_PUBLISH(*upd_entry, upd);
	/*
	 * NOTE(review): presumably records that the update now belongs to
	 * the page so the caller won't free it -- confirm against the
	 * serialization helpers.
	 */
	__wt_update_upd_taken(session, args, page);

	/*
	 * If the page needs an update array (column-store pages and inserts on
	 * row-store pages do not use the update array), our caller passed us
	 * one of the correct size.   Check the page still needs one (the write
	 * generation test should have caught that, though).
	 *
	 * NOTE: it is important to do this after publishing that the update is
	 * set.  Code can assume that if the array is set, it is non-empty.
	 */
	if (new_upd != NULL && page->u.row.upd == NULL) {
		page->u.row.upd = new_upd;
		__wt_update_new_upd_taken(session, args, page);
	}

	/*
	 * Discard obsolete WT_UPDATE structures: the check is run on the
	 * entries that follow the new update, and its result is handed back
	 * to the caller through upd_obsolete.
	 */
	*upd_obsolete = __wt_update_obsolete_check(session, upd->next);

	__wt_page_and_tree_modify_set(session, page);
	return (0);
}
Пример #3
0
/*
 * __wt_insert_serial_func --
 *	Server function to add a WT_INSERT entry to the page.
 *
 *	Unpacks the caller's arguments, checks the page's write generation,
 *	links the new item into the skiplist at each of its skipdepth levels,
 *	installs the caller-supplied insert head and insert array where they
 *	are missing, and marks the page and tree modified.
 *
 *	Returns 0 on success.  A failing write-generation check is handed to
 *	WT_RET before anything is changed (presumably returning that error to
 *	the caller -- confirm against the macro's definition).
 */
int
__wt_insert_serial_func(WT_SESSION_IMPL *session, void *args)
{
	WT_INSERT *new_ins, ***ins_stack;
	WT_INSERT_HEAD *inshead, **insheadp, **new_inslist, *new_inshead;
	WT_PAGE *page;
	uint32_t write_gen;
	u_int i, skipdepth;

	/* Pull the individual arguments out of the opaque argument block. */
	__wt_insert_unpack(args, &page, &write_gen, &insheadp,
	    &ins_stack, &new_inslist, &new_inshead, &new_ins, &skipdepth);

	/* Check the page's write-generation. */
	WT_RET(__wt_page_write_gen_check(session, page, write_gen));

	/*
	 * Publish: First, point the new WT_INSERT item's skiplist references
	 * to the next elements in the insert list, then flush memory.  Second,
	 * update the skiplist elements that reference the new WT_INSERT item,
	 * this ensures the list is never inconsistent.
	 *
	 * If the slot has no insert head yet, work on the one the caller
	 * supplied; it is only installed in the slot further down.
	 */
	if ((inshead = *insheadp) == NULL)
		inshead = new_inshead;
	for (i = 0; i < skipdepth; i++)
		new_ins->next[i] = *ins_stack[i];
	WT_WRITE_BARRIER();
	for (i = 0; i < skipdepth; i++) {
		/*
		 * The new item becomes the tail of this level if the level
		 * is empty or the item is being linked after the current
		 * tail.
		 */
		if (inshead->tail[i] == NULL ||
		    ins_stack[i] == &inshead->tail[i]->next[i])
			inshead->tail[i] = new_ins;
		*ins_stack[i] = new_ins;
	}

	/*
	 * NOTE(review): presumably records that the new item now belongs to
	 * the page so the caller won't free it -- confirm against the
	 * serialization helpers.
	 */
	__wt_insert_new_ins_taken(session, args, page);

	/*
	 * If the insert head does not yet have an insert list, our caller
	 * passed us one.
	 *
	 * NOTE: it is important to do this after the item has been added to
	 * the list.  Code can assume that if the list is set, it is non-empty.
	 */
	if (*insheadp == NULL) {
		WT_PUBLISH(*insheadp, new_inshead);
		__wt_insert_new_inshead_taken(session, args, page);
	}

	/*
	 * If the page does not yet have an insert array, our caller passed
	 * us one.
	 *
	 * NOTE: it is important to do this after publishing the list entry.
	 * Code can assume that if the array is set, it is non-empty.
	 *
	 * Row-store leaf pages keep the array in the page itself; any other
	 * page type keeps it in the page's modify structure.
	 */
	if (page->type == WT_PAGE_ROW_LEAF) {
		if (page->u.row.ins == NULL) {
			page->u.row.ins = new_inslist;
			__wt_insert_new_inslist_taken(session, args, page);
		}
	} else
		if (page->modify->update == NULL) {
			page->modify->update = new_inslist;
			__wt_insert_new_inslist_taken(session, args, page);
		}
	__wt_page_and_tree_modify_set(session, page);
	return (0);
}
Пример #4
0
/*
 * __wt_col_append_serial_func --
 *	Server function to append a WT_INSERT entry to the tree.
 *
 *	Unpacks the caller's arguments, checks the page's write generation,
 *	settles on a record number (allocating one if the new item has none),
 *	searches the append list for that record number, and, if it isn't
 *	already there, links the new item into the skiplist.  The
 *	caller-supplied insert head and append array are installed where they
 *	are missing, the tree's last record number is advanced if needed, and
 *	the page and tree are marked modified.
 *
 *	Returns 0 on success.  A failing write-generation check is handed to
 *	WT_RET, as is WT_RESTART when the record number is already on the
 *	list (presumably returning those values to the caller -- confirm
 *	against the macro's definition).
 */
int
__wt_col_append_serial_func(WT_SESSION_IMPL *session, void *args)
{
	WT_BTREE *btree;
	WT_INSERT *ins, *new_ins, ***ins_stack, **next_stack;
	WT_INSERT_HEAD *inshead, **insheadp, **new_inslist, *new_inshead;
	WT_PAGE *page;
	uint64_t recno;
	uint32_t write_gen;
	u_int i, skipdepth;

	btree = S2BT(session);

	/* Pull the individual arguments out of the opaque argument block. */
	__wt_col_append_unpack(args,
	    &page, &write_gen, &insheadp, &ins_stack, &next_stack,
	    &new_inslist, &new_inshead, &new_ins, &skipdepth);

	/* Check the page's write-generation. */
	WT_RET(__wt_page_write_gen_check(session, page, write_gen));

	/*
	 * If the slot has no insert head yet, work on the one the caller
	 * supplied; it is only installed in the slot further down.
	 */
	if ((inshead = *insheadp) == NULL)
		inshead = new_inshead;

	/*
	 * If the application specified a record number, there's a race: the
	 * application may have searched for the record, not found it, then
	 * called into the append code, and another thread might have added
	 * the record.  Fortunately, we're in the right place because if the
	 * record didn't exist at some point, it can only have been created
	 * on this list.  Search for the record, if specified.
	 *
	 * A record number of zero in the new item is replaced by the tree's
	 * last record number plus one, which is stored back into both the
	 * item and the tree.
	 */
	if ((recno = WT_INSERT_RECNO(new_ins)) == 0)
		recno = WT_INSERT_RECNO(new_ins) = ++btree->last_recno;

	/*
	 * NOTE(review): the search presumably also fills ins_stack with the
	 * insertion points used below -- confirm against the search function.
	 */
	ins = __col_insert_search(inshead, ins_stack, next_stack, recno);

	/* If we find the record number, there's been a race. */
	if (ins != NULL && WT_INSERT_RECNO(ins) == recno)
		WT_RET(WT_RESTART);

	/*
	 * Publish: First, point the new WT_INSERT item's skiplist references
	 * to the next elements in the insert list, then flush memory.  Second,
	 * update the skiplist elements that reference the new WT_INSERT item,
	 * this ensures the list is never inconsistent.
	 */
	for (i = 0; i < skipdepth; i++)
		new_ins->next[i] = *ins_stack[i];
	WT_WRITE_BARRIER();
	for (i = 0; i < skipdepth; i++) {
		/*
		 * The new item becomes the tail of this level if the level
		 * is empty or the item is being linked after the current
		 * tail.
		 */
		if (inshead->tail[i] == NULL ||
		    ins_stack[i] == &inshead->tail[i]->next[i])
			inshead->tail[i] = new_ins;
		*ins_stack[i] = new_ins;
	}

	/*
	 * NOTE(review): presumably records that the new item now belongs to
	 * the tree so the caller won't free it -- confirm against the
	 * serialization helpers.
	 */
	__wt_col_append_new_ins_taken(args);

	/*
	 * If the insert head does not yet have an insert list, our caller
	 * passed us one.
	 *
	 * NOTE: it is important to do this after the item has been added to
	 * the list.  Code can assume that if the list is set, it is non-empty.
	 */
	if (*insheadp == NULL) {
		WT_PUBLISH(*insheadp, new_inshead);
		__wt_col_append_new_inshead_taken(args);
	}

	/*
	 * If the page does not yet have an insert array, our caller passed
	 * us one.
	 *
	 * NOTE: it is important to do this after publishing the list entry.
	 * Code can assume that if the array is set, it is non-empty.
	 */
	if (page->modify->append == NULL) {
		page->modify->append = new_inslist;
		__wt_col_append_new_inslist_taken(args);
	}

	/*
	 * If we don't find the record, check to see if we extended the file,
	 * and update the last record number.
	 */
	if (recno > btree->last_recno)
		btree->last_recno = recno;

	__wt_page_and_tree_modify_set(session, page);
	return (0);
}
Пример #5
0
/*
 * __wt_compact_evict --
 *	Decide whether re-writing this page would help shrink the file and,
 * if so, mark the page and the tree dirty so the page gets written.
 *
 *	Returns 0 both when the page is left alone and when it is marked
 * dirty; errors from the block manager or from setting up the page's modify
 * structure are passed back through WT_RET.
 */
int
__wt_compact_evict(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_BM *block_mgr;
	WT_PAGE_MODIFY *modify;
	const uint8_t *disk_addr;
	uint32_t disk_addr_size;
	int leave_alone;

	/*
	 * Reviewing reconciliation information is necessary because an
	 * in-memory page's original disk addresses may have been fine for
	 * compaction while its replacement addresses are not.  Doing so
	 * requires that both eviction and checkpoints be locked out, as they
	 * are the other two operations that can write a page.
	 *
	 * Two kinds of page need no review at all:
	 *  - the root, which may not have a replacement address and which
	 *    gets written whenever anything else is written;
	 *  - a page that is already dirty, which will be written anyway.
	 */
	if (WT_PAGE_IS_ROOT(page) || __wt_page_is_modified(page))
		return (0);

	block_mgr = S2BT(session)->bm;
	modify = page->modify;

	/*
	 * Pick the addresses to test:
	 *  - a clean page (no modify structure, or no reconciliation result
	 *    flag set) is tested against its original addresses;
	 *  - a 1-to-1 replacement is tested against its replacement addresses;
	 *  - empty and split pages are ignored, splits will be merged into
	 *    the parent.
	 *
	 * Any other flag value matches no case and leaves leave_alone at its
	 * initial value, so the page is marked dirty.
	 */
	leave_alone = 0;
	switch (modify == NULL ? 0 : (F_ISSET(modify, WT_PM_REC_MASK))) {
	case 0:
		__wt_get_addr(
		    page->parent, page->ref, &disk_addr, &disk_addr_size);
		if (disk_addr == NULL)
			return (0);
		WT_RET(block_mgr->compact_page_skip(block_mgr,
		    session, disk_addr, disk_addr_size, &leave_alone));
		break;
	case WT_PM_REC_REPLACE:
		WT_RET(block_mgr->compact_page_skip(block_mgr, session,
		    modify->u.replace.addr, modify->u.replace.size,
		    &leave_alone));
		break;
	case WT_PM_REC_EMPTY:
	case WT_PM_REC_SPLIT:
	case WT_PM_REC_SPLIT_MERGE:
		return (0);
	}

	/* The block manager says re-writing this page wouldn't help. */
	if (leave_alone)
		return (0);

	/* We want this page written: mark it and the tree dirty. */
	WT_RET(__wt_page_modify_init(session, page));
	__wt_page_and_tree_modify_set(session, page);

	WT_DSTAT_INCR(session, btree_compact_rewrite);
	return (0);
}