Beispiel #1
0
/*
 * __wt_compact_page_skip --
 *	Tell the caller whether compaction can avoid reading the page behind
 * a WT_REF.  *skipp is false when the page has to be read; for pages that
 * have an address and are not internal pages, the block manager is handed
 * skipp and makes the decision.
 */
int
__wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
    WT_BM *bm;
    const uint8_t *addr;
    size_t addr_size;
    u_int type;

    /* Unless something below says otherwise, the page gets read. */
    *skipp = false;
    type = 0;               /* Avoids an uninitialized-variable warning. */

    bm = S2BT(session)->bm;

    /*
     * No hazard pointer is held, which rules out looking at the page
     * itself: the WT_REF is the only information available.
     */
    __wt_ref_info(ref, &addr, &addr_size, &type);

    /*
     * Two cases always require the read: a reference without an address
     * (the page isn't on disk), and an internal page (the tree can't be
     * walked without it).
     */
    if (addr == NULL || type == WT_CELL_ADDR_INT)
        return (0);

    /*
     * Anything else is left to the block manager, which reports through
     * skipp whether rewriting the page would help; if it won't, the I/O
     * is avoided.
     */
    return (bm->compact_page_skip(bm, session, addr, addr_size, skipp));
}
Beispiel #2
0
/*
 * __compact_rewrite --
 *	Decide whether compaction should rewrite the page referenced by ref.
 * *skipp is set to true up front and is only changed by the block manager's
 * compact_page_skip method.  Returns 0, or the first non-zero status from
 * the address lookup, the block manager or the page lock calls.
 */
static int
__compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
	WT_BM *bm;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	const uint8_t *addr;
	size_t addr_size;

	/* Skipping is the answer unless the block manager overrides it. */
	*skipp = true;

	bm = S2BT(session)->bm;
	page = ref->page;
	mod = page->modify;

	/*
	 * Two kinds of page are never considered:
	 *  - the root, which may not have a replacement address and gets
	 *    written anyway if anything else is written;
	 *  - pages that are currently dirty, which will be written regardless.
	 */
	if (__wt_ref_is_root(ref) || __wt_page_is_modified(page))
		return (0);

	/* Clean page: the original address is the one to test. */
	if (mod == NULL || mod->rec_result == 0) {
		WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
		if (addr != NULL) {
			WT_RET(bm->compact_page_skip(
			    bm, session, addr, addr_size, skipp));
		}
		return (0);
	}

	/*
	 * Only 1-to-1 replacements are tested beyond this point; every other
	 * reconciliation result (empty pages, for instance, which get merged
	 * into the parent) keeps the default.
	 */
	if (mod->rec_result != WT_PM_REC_REPLACE)
		return (0);

	/*
	 * Reconciliation can change the page's modification information
	 * underfoot; hold the page lock while looking at the replacement
	 * address.
	 */
	WT_RET(__wt_fair_lock(session, &page->page_lock));
	ret = bm->compact_page_skip(bm, session,
	    mod->mod_replace.addr, mod->mod_replace.size, skipp);
	WT_TRET(__wt_fair_unlock(session, &page->page_lock));

	return (ret);
}
Beispiel #3
0
/*
 * __wt_page_addr_string --
 *	Figure out a page's "address" and load a buffer with a printable,
 * nul-terminated representation of that address.
 *
 *	session: the session handle, passed through to the helper calls.
 *	ref: the reference whose address is formatted.
 *	buf: the item that receives the string.
 *
 *	Returns the string: "[Root]" for the root reference, otherwise whatever
 * __wt_addr_string produces for the reference's address.
 */
const char *
__wt_page_addr_string(WT_SESSION_IMPL *session, WT_REF *ref, WT_ITEM *buf)
{
	size_t addr_size;
	const uint8_t *addr;

	if (__wt_ref_is_root(ref)) {
		buf->data = "[Root]";
		buf->size = strlen("[Root]");
		return (buf->data);
	}

	/*
	 * This function has no way to report an error, so a failed lookup
	 * can't be propagated. It must not leave addr/addr_size indeterminate
	 * either: give them defined values before the call and fall back to
	 * "no address" if the lookup fails.
	 *
	 * NOTE(review): relies on __wt_addr_string accepting a NULL address,
	 * which it already has to for references that have no address.
	 */
	addr = NULL;
	addr_size = 0;
	if (__wt_ref_info(session, ref, &addr, &addr_size, NULL) != 0) {
		addr = NULL;
		addr_size = 0;
	}
	return (__wt_addr_string(session, addr, addr_size, buf));
}
Beispiel #4
0
/*
 * __ref_is_leaf --
 *	Report whether a reference is known to be for a leaf page.  Returns
 * false both for non-leaf pages and for references with no address.
 */
static inline bool
__ref_is_leaf(WT_REF *ref)
{
	const uint8_t *addr;
	size_t addr_size;
	u_int type;

	__wt_ref_info(ref, &addr, &addr_size, &type);

	/*
	 * Without a disk address the page isn't on disk and its type is
	 * unknown; treat that as "not a leaf".
	 */
	if (addr == NULL)
		return (false);

	/* With an address, the cell type says whether this is a leaf. */
	return (type == WT_CELL_ADDR_LEAF || type == WT_CELL_ADDR_LEAF_NO);
}
Beispiel #5
0
/*
 * __wt_page_addr_string --
 *	Figure out a page's "address" and load a buffer with a printable,
 * nul-terminated representation of that address.
 *
 *	session: the session handle, passed through to the helper calls.
 *	buf: the item that receives the string.
 *	page: the page whose address is formatted; the address is looked up
 * through page->parent and page->ref.
 *
 *	Returns the string: "[Root]" for the root page, otherwise whatever
 * __wt_addr_string produces for the page's address.
 */
const char *
__wt_page_addr_string(WT_SESSION_IMPL *session, WT_ITEM *buf, WT_PAGE *page)
{
	uint32_t size;
	const uint8_t *addr;

	if (WT_PAGE_IS_ROOT(page)) {
		buf->data = "[Root]";
		buf->size = WT_STORE_SIZE(strlen("[Root]"));
		return (buf->data);
	}

	/*
	 * This function has no way to report an error, so a failed lookup
	 * can't be propagated. It must not leave addr/size indeterminate
	 * either: give them defined values before the call and fall back to
	 * "no address" if the lookup fails.
	 *
	 * NOTE(review): relies on __wt_addr_string accepting a NULL address;
	 * confirm against its implementation.
	 */
	addr = NULL;
	size = 0;
	if (__wt_ref_info(
	    session, page->parent, page->ref, &addr, &size, NULL) != 0) {
		addr = NULL;
		size = 0;
	}
	return (__wt_addr_string(session, buf, addr, size));
}
Beispiel #6
0
/*
 * __compact_rewrite --
 *	Decide whether compaction should rewrite the page referenced by ref.
 * *skipp starts out as 1 (skip) and is only changed by the block manager's
 * compact_page_skip method.  Returns 0, or the first non-zero status from
 * the address lookup or the block manager.
 */
static int __compact_rewrite(WT_SESSION_IMPL* session, WT_REF* ref, int* skipp)
{
	WT_BM *bm;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	const uint8_t *addr;
	size_t addr_size;

	/* Skipping is the answer unless the block manager overrides it. */
	*skipp = 1;

	bm = S2BT(session)->bm;
	page = ref->page;
	mod = page->modify;

	/* The root page is never compacted, and neither are dirty pages. */
	if (__wt_ref_is_root(ref) || __wt_page_is_modified(page))
		return (0);

	/*
	 * No modify structure, or no reconciliation result recorded: ask the
	 * block manager about the address stored in the reference, if any.
	 */
	if (mod == NULL || F_ISSET(mod, WT_PM_REC_MASK) == 0) {
		WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
		if (addr != NULL) {
			WT_RET(bm->compact_page_skip(
			    bm, session, addr, addr_size, skipp));
		}
		return (0);
	}

	/* Any reconciliation result other than "replace" keeps the default. */
	if (F_ISSET(mod, WT_PM_REC_MASK) != WT_PM_REC_REPLACE)
		return (0);

	/*
	 * The page was replaced by another block: ask the block manager about
	 * the replacement address while holding the page lock.
	 */
	WT_PAGE_LOCK(session, page);
	ret = bm->compact_page_skip(
	    bm, session, mod->mod_replace.addr, mod->mod_replace.size, skipp);
	WT_PAGE_UNLOCK(session, page);

	return (ret);
}
Beispiel #7
0
/*
 * __wt_compact_page_skip --
 *	Called before the page behind ref is read, to check whether
 * compaction needs it.  *skipp is 0 when the page has to be read; for
 * pages that have an address and are not internal pages, the block manager
 * is handed skipp and makes the decision.  Returns 0, or a non-zero status
 * from the address lookup or the block manager.
 */
int __wt_compact_page_skip(WT_SESSION_IMPL* session, WT_REF* ref, int* skipp)
{
	WT_BM *bm;
	const uint8_t *addr;
	size_t addr_size;
	u_int type;

	/* Unless something below says otherwise, the page gets read. */
	*skipp = 0;
	type = 0;

	bm = S2BT(session)->bm;

	/*
	 * No hazard pointer is held, which rules out looking at the page
	 * itself: the WT_REF is the only information available.
	 */
	WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, &type));

	/*
	 * Two cases always require the read: a reference without an address
	 * (the page isn't on disk), and an internal page (the tree can't be
	 * walked without it).
	 */
	if (addr == NULL || type == WT_CELL_ADDR_INT)
		return (0);

	/* Anything else is left to the block manager. */
	return (bm->compact_page_skip(bm, session, addr, addr_size, skipp));
}
Beispiel #8
0
/*
 * __page_read --
 *	Read a page from the file.
 *
 *	session: the session handle; its btree and data handle are consulted.
 *	ref: the reference to read. The function only proceeds if it can
 * switch ref->state from WT_REF_DISK to WT_REF_READING, or from
 * WT_REF_DELETED to WT_REF_LOCKED.
 *
 *	Returns 0 if the page was brought into memory (ref->state is then
 * WT_REF_MEM) or if neither state switch succeeded (nothing is done in that
 * case). On error, ref->state is put back to what it was on entry and the
 * error is returned.
 */
static int
__page_read(WT_SESSION_IMPL *session, WT_REF *ref)
{
	const WT_PAGE_HEADER *dsk;
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_ITEM tmp;
	WT_PAGE *page;
	size_t addr_size;
	uint32_t previous_state;
	const uint8_t *addr;

	btree = S2BT(session);
	page = NULL;

	/*
	 * Don't pass an allocated buffer to the underlying block read function,
	 * force allocation of new memory of the appropriate size.
	 */
	WT_CLEAR(tmp);

	/*
	 * Attempt to set the state to WT_REF_READING for normal reads, or
	 * WT_REF_LOCKED, for deleted pages.  If successful, we've won the
	 * race, read the page.
	 *
	 * The state we came from is remembered: the error path restores it,
	 * and it decides below whether delete information is instantiated.
	 * If neither swap succeeds, return success without touching the
	 * reference.
	 */
	if (__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_READING))
		previous_state = WT_REF_DISK;
	else if (__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED))
		previous_state = WT_REF_DELETED;
	else
		return (0);

	/*
	 * Get the address: if there is no address, the page was deleted, but a
	 * subsequent search or insert is forcing re-creation of the name space.
	 * A new leaf page is installed in that case and nothing is read.
	 */
	WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
	if (addr == NULL) {
		WT_ASSERT(session, previous_state == WT_REF_DELETED);

		WT_ERR(__wt_btree_new_leaf_page(session, &page));
		ref->page = page;
		goto done;
	}

	/*
	 * There's an address, read or map the backing disk page and build an
	 * in-memory version of the page. The page is flagged as owning an
	 * allocated image when the data lives in tmp's own memory, and as
	 * mapped otherwise.
	 */
	WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size));
	WT_ERR(__wt_page_inmem(session, ref, tmp.data, tmp.memsize,
	    WT_DATA_IN_ITEM(&tmp) ?
	    WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));

	/*
	 * Clear the local reference to an allocated copy of the disk image on
	 * return; the page steals it, errors in this code should not free it.
	 * Only tmp.mem is cleared: tmp.data is still used below to look at
	 * the page header.
	 */
	tmp.mem = NULL;

	/*
	 * If reading for a checkpoint, there's no additional work to do, the
	 * page on disk is correct as written.
	 */
	if (session->dhandle->checkpoint != NULL)
		goto done;

	/* If the page was deleted, instantiate that information. */
	if (previous_state == WT_REF_DELETED)
		WT_ERR(__wt_delete_page_instantiate(session, ref));

	/*
	 * Instantiate updates from the database's lookaside table. The page
	 * flag was set when the page was written, potentially a long time ago.
	 * We only care if the lookaside table is currently active, check that
	 * before doing any work.
	 */
	dsk = tmp.data;
	if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE) && __wt_las_is_written(session)) {
		WT_STAT_FAST_CONN_INCR(session, cache_read_lookaside);
		WT_STAT_FAST_DATA_INCR(session, cache_read_lookaside);

		WT_ERR(__las_page_instantiate(
		    session, ref, btree->id, addr, addr_size));
	}

	/* Success: publish the in-memory state for the reference. */
done:	WT_PUBLISH(ref->state, WT_REF_MEM);
	return (0);

err:	/*
	 * If the function building an in-memory version of the page failed,
	 * it discarded the page, but not the disk image.  Discard the page
	 * and separately discard the disk image in all cases.
	 *
	 * The reference goes back to the state it had on entry.
	 */
	if (ref->page != NULL)
		__wt_ref_out(session, ref);
	WT_PUBLISH(ref->state, previous_state);

	__wt_buf_free(session, &tmp);

	return (ret);
}
Beispiel #9
0
/*
 * __compact_rewrite --
 *	Return if a page needs to be re-written.
 *
 *	session: the session handle; its btree's block manager is consulted.
 *	ref: the reference to the in-memory page being considered.
 *	skipp: set to true (skip) up front, and only changed by the block
 * manager's compact_page_skip method.
 *
 *	Returns 0, or the first non-zero status from the block manager or the
 * page lock calls.
 */
static int
__compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
    WT_BM *bm;
    WT_DECL_RET;
    WT_MULTI *multi;
    WT_PAGE *page;
    WT_PAGE_MODIFY *mod;
    size_t addr_size;
    uint32_t i, rec_result;
    const uint8_t *addr;

    *skipp = true;					/* Default skip. */

    bm = S2BT(session)->bm;
    page = ref->page;
    mod = page->modify;

    /*
     * Ignore the root: it may not have a replacement address, and besides,
     * if anything else gets written, so will it.
     */
    if (__wt_ref_is_root(ref))
        return (0);

    /* Ignore currently dirty pages, they will be written regardless. */
    if (__wt_page_is_modified(page))
        return (0);

    /*
     * If the page is clean, test the original addresses.
     * If the page is a replacement, test the replacement addresses.
     * Ignore empty pages, they get merged into the parent.
     */
    if (mod == NULL || mod->rec_result == 0) {
        __wt_ref_info(ref, &addr, &addr_size, NULL);
        if (addr == NULL)
            return (0);
        return (
                   bm->compact_page_skip(bm, session, addr, addr_size, skipp));
    }

    /*
     * Unlocked peek: results other than a replacement or a multi-block
     * split keep the default without paying for the page lock.
     */
    rec_result = mod->rec_result;
    if (rec_result != WT_PM_REC_REPLACE &&
            rec_result != WT_PM_REC_MULTIBLOCK)
        return (0);

    /*
     * The page's modification information can change underfoot if the page
     * is being reconciled, serialize with reconciliation.
     *
     * From here on the lock is always taken and always released. The
     * decision to lock and the decision to unlock must not each depend on
     * a fresh read of mod->rec_result: if the value changed in between,
     * the lock could be released without being held, or held and never
     * released.
     */
    WT_RET(__wt_fair_lock(session, &page->page_lock));

    /*
     * Re-read the result now that the lock is held and dispatch on that
     * single value, so the addresses examined match the result seen.
     */
    rec_result = mod->rec_result;
    if (rec_result == WT_PM_REC_REPLACE)
        ret = bm->compact_page_skip(bm, session,
                                    mod->mod_replace.addr, mod->mod_replace.size, skipp);
    else if (rec_result == WT_PM_REC_MULTIBLOCK)
        for (multi = mod->mod_multi,
                i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
            /* Entries that still carry a disk image are not tested. */
            if (multi->disk_image != NULL)
                continue;
            if ((ret = bm->compact_page_skip(bm, session,
                                             multi->addr.addr, multi->addr.size, skipp)) != 0)
                break;
            /* One block worth rewriting is enough to rewrite the page. */
            if (!*skipp)
                break;
        }

    WT_TRET(__wt_fair_unlock(session, &page->page_lock));

    return (ret);
}
Beispiel #10
0
/*
 * __wt_cache_read --
 *	Read a page from the file.
 *
 *	session: the session handle.
 *	ref: the reference to read. The function only proceeds if it can
 * switch ref->state from WT_REF_DISK to WT_REF_READING, or from
 * WT_REF_DELETED to WT_REF_LOCKED.
 *
 *	Returns 0 if the page was brought into memory (ref->state is then
 * WT_REF_MEM) or if neither state switch succeeded (nothing is done in that
 * case). On error, ref->state is put back to what it was on entry and the
 * error is returned.
 */
int
__wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_DECL_RET;
	WT_ITEM tmp;
	WT_PAGE *page;
	WT_PAGE_STATE previous_state;
	size_t addr_size;
	const uint8_t *addr;

	page = NULL;

	/*
	 * Don't pass an allocated buffer to the underlying block read function,
	 * force allocation of new memory of the appropriate size.
	 */
	WT_CLEAR(tmp);

	/*
	 * Attempt to set the state to WT_REF_READING for normal reads, or
	 * WT_REF_LOCKED, for deleted pages.  If successful, we've won the
	 * race, read the page.
	 */
	if (WT_ATOMIC_CAS4(ref->state, WT_REF_DISK, WT_REF_READING))
		previous_state = WT_REF_DISK;
	else if (WT_ATOMIC_CAS4(ref->state, WT_REF_DELETED, WT_REF_LOCKED))
		previous_state = WT_REF_DELETED;
	else
		return (0);

	/*
	 * Get the address: if there is no address, the page was deleted, but a
	 * subsequent search or insert is forcing re-creation of the name space.
	 * Otherwise, there's an address, read the backing disk page and build
	 * an in-memory version of the page.
	 */
	WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
	if (addr == NULL) {
		WT_ASSERT(session, previous_state == WT_REF_DELETED);

		WT_ERR(__wt_btree_new_leaf_page(session, &page));
		ref->page = page;
	} else {
		/* Read the backing disk page. */
		WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size));

		/* Build the in-memory version of the page. */
		WT_ERR(__wt_page_inmem(session, ref, tmp.data,
		    WT_DATA_IN_ITEM(&tmp) ?
		    WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));

		/*
		 * The page now owns the disk image: the success path returns
		 * without freeing tmp, so the allocation lives on with the
		 * page. Drop the local reference so that an error below,
		 * which discards the page and then frees tmp, cannot release
		 * the same allocation a second time.
		 */
		tmp.mem = NULL;

		/* If the page was deleted, instantiate that information. */
		if (previous_state == WT_REF_DELETED)
			WT_ERR(__wt_delete_page_instantiate(session, ref));
	}

	WT_ERR(__wt_verbose(session, WT_VERB_READ,
	    "page %p: %s", page, __wt_page_type_string(page->type)));

	WT_PUBLISH(ref->state, WT_REF_MEM);
	return (0);

err:	/*
	 * If the function building an in-memory version of the page failed,
	 * it discarded the page, but not the disk image: tmp still references
	 * the image in that case and is freed here. Once a page has been
	 * built, the local reference is cleared, and discarding the page is
	 * what releases the image.
	 */
	if (ref->page != NULL)
		__wt_ref_out(session, ref);
	WT_PUBLISH(ref->state, previous_state);

	__wt_buf_free(session, &tmp);

	return (ret);
}
Beispiel #11
0
/*
 * __wt_compact_page_skip --
 *	Return if compaction requires we read this page.
 *
 *	session: the session handle; its btree's block manager is consulted.
 *	ref: the reference being considered; its state is briefly switched to
 * WT_REF_LOCKED while the address is examined.
 *	skipp: set to true for deleted pages, false otherwise, unless the
 * block manager is asked, in which case it is handed skipp.
 *
 *	Returns the block manager's status if it was consulted, otherwise the
 * value ret was declared with.
 */
int
__wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
	WT_BM *bm;
	WT_DECL_RET;	/* NOTE(review): presumably declares ret as 0; confirm. */
	size_t addr_size;
	u_int type;
	const uint8_t *addr;

	/*
	 * Skip deleted pages, rewriting them doesn't seem useful; in a better
	 * world we'd write the parent to delete the page.
	 */
	if (ref->state == WT_REF_DELETED) {
		*skipp = true;
		return (0);
	}

	*skipp = false;				/* Default to reading */

	/*
	 * If the page is in-memory, we want to look at it (it may have been
	 * modified and written, and the current location is the interesting
	 * one in terms of compaction, not the original location).
	 *
	 * This test could be combined with the next one, but this is a cheap
	 * test and the next one is expensive.
	 */
	if (ref->state != WT_REF_DISK)
		return (0);

	/*
	 * There's nothing to prevent the WT_REF state from changing underfoot,
	 * which can change its address. For example, the WT_REF address might
	 * reference an on-page cell, and page eviction can free that memory.
	 * Lock the WT_REF so we can look at its address.
	 *
	 * If the swap fails the state is no longer WT_REF_DISK; leave the
	 * default (read the page) in place.
	 */
	if (!__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED))
		return (0);

	/*
	 * The page is on disk, so there had better be an address; assert that
	 * fact, test at run-time to avoid the core dump.
	 *
	 * Internal pages must be read to walk the tree; ask the block-manager
	 * if it's useful to rewrite leaf pages, don't do the I/O if a rewrite
	 * won't help.
	 */
	__wt_ref_info(ref, &addr, &addr_size, &type);
	WT_ASSERT(session, addr != NULL);
	if (addr != NULL && type != WT_CELL_ADDR_INT) {
		bm = S2BT(session)->bm;
		ret = bm->compact_page_skip(
		    bm, session, addr, addr_size, skipp);
	}

	/*
	 * Reset the WT_REF state and push the change. The full-barrier isn't
	 * necessary, but it's better to keep pages in circulation than not.
	 *
	 * Every path past the successful swap above reaches this point, so
	 * the reference is never left in WT_REF_LOCKED.
	 */
	ref->state = WT_REF_DISK;
	WT_FULL_BARRIER();

	return (ret);
}