Exemple #1
0
/*
 * __wt_delete_page_skip --
 *	If iterating a cursor, skip deleted pages that are visible to us.
 */
bool
__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
{
	bool skip;

	/*
	 * Deleted pages come from two sources: either it's a fast-delete as
	 * described above, or the page has been emptied by other operations
	 * and eviction deleted it.
	 *
	 * In both cases, the WT_REF state will be WT_REF_DELETED.  In the case
	 * of a fast-delete page, there will be a WT_PAGE_DELETED structure with
	 * the transaction ID of the transaction that deleted the page, and the
	 * page is visible if that transaction ID is visible.  In the case of an
	 * empty page, there will be no WT_PAGE_DELETED structure and the delete
	 * is by definition visible, eviction could not have deleted the page if
	 * there were changes on it that were not globally visible.
	 *
	 * We're here because we found a WT_REF state set to WT_REF_DELETED.  It
	 * is possible the page is being read into memory right now, though, and
	 * the page could switch to an in-memory state at any time.  Lock down
	 * the structure, just to be safe.
	 */
	if (ref->page_del == NULL)
		return (true);

	if (!__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED))
		return (false);

	skip = (ref->page_del == NULL ||
	    __wt_txn_visible(session, ref->page_del->txnid));

	WT_PUBLISH(ref->state, WT_REF_DELETED);
	return (skip);
}
Exemple #2
0
/*
 * __wt_delete_page_rollback --
 *	Abort pages that were deleted without being instantiated.
 */
void
__wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_UPDATE **upd;

	/*
	 * If the page is still "deleted", it's as we left it, reset the state
	 * to on-disk and we're done.  Otherwise, we expect the page is either
	 * instantiated or being instantiated.  Loop because it's possible for
	 * the page to return to the deleted state if instantiation fails.
	 */
	for (;; __wt_yield())
		switch (ref->state) {
		case WT_REF_DISK:
		case WT_REF_READING:
			WT_ASSERT(session, 0);		/* Impossible, assert */
			break;
		case WT_REF_DELETED:
			/*
			 * If the page is still "deleted", it's as we left it,
			 * reset the state.
			 */
			if (__wt_atomic_casv32(
			    &ref->state, WT_REF_DELETED, WT_REF_DISK))
				return;
			break;
		case WT_REF_LOCKED:
			/*
			 * A possible state, the page is being instantiated.
			 */
			break;
		case WT_REF_MEM:
		case WT_REF_SPLIT:
			/*
			 * We can't use the normal read path to get a copy of
			 * the page because the session may have closed the
			 * cursor, we no longer have the reference to the tree
			 * required for a hazard pointer.  We're safe because
			 * with unresolved transactions, the page isn't going
			 * anywhere.
			 *
			 * The page is in an in-memory state, walk the list of
			 * update structures and abort them.
			 */
			for (upd =
			    ref->page_del->update_list; *upd != NULL; ++upd)
				(*upd)->txnid = WT_TXN_ABORTED;

			/*
			 * Discard the memory, the transaction can't abort
			 * twice.
			 */
			__wt_free(session, ref->page_del->update_list);
			__wt_free(session, ref->page_del);
			return;
		}
}
Exemple #3
0
/*
 * __wt_delete_page_skip --
 *	If iterating a cursor, skip deleted pages that are either visible to
 * us or globally visible.
 */
bool
__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
{
	bool skip;

	/*
	 * Deleted pages come from two sources: either it's a fast-delete as
	 * described above, or the page has been emptied by other operations
	 * and eviction deleted it.
	 *
	 * In both cases, the WT_REF state will be WT_REF_DELETED.  In the case
	 * of a fast-delete page, there will be a WT_PAGE_DELETED structure with
	 * the transaction ID of the transaction that deleted the page, and the
	 * page is visible if that transaction ID is visible.  In the case of an
	 * empty page, there will be no WT_PAGE_DELETED structure and the delete
	 * is by definition visible, eviction could not have deleted the page if
	 * there were changes on it that were not globally visible.
	 *
	 * We're here because we found a WT_REF state set to WT_REF_DELETED.  It
	 * is possible the page is being read into memory right now, though, and
	 * the page could switch to an in-memory state at any time.  Lock down
	 * the structure, just to be safe.
	 */
	if (ref->page_del == NULL)
		return (true);

	if (!__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED))
		return (false);

	skip = ref->page_del == NULL || (visible_all ?
	    __wt_txn_visible_all(session, ref->page_del->txnid,
		WT_TIMESTAMP_NULL(&ref->page_del->timestamp)):
	    __wt_txn_visible(session, ref->page_del->txnid,
		WT_TIMESTAMP_NULL(&ref->page_del->timestamp)));

	/*
	 * The page_del structure can be freed as soon as the delete is stable:
	 * it is only read when the ref state is WT_REF_DELETED.  It is worth
	 * checking every time we come through because once this is freed, we
	 * no longer need synchronization to check the ref.
	 */
	if (skip && ref->page_del != NULL && (visible_all ||
	    __wt_txn_visible_all(session, ref->page_del->txnid,
		WT_TIMESTAMP_NULL(&ref->page_del->timestamp)))) {
		__wt_free(session, ref->page_del->update_list);
		__wt_free(session, ref->page_del);
	}

	WT_PUBLISH(ref->state, WT_REF_DELETED);
	return (skip);
}
Exemple #4
0
/*
 * __wt_delete_page --
 *	If deleting a range, try to delete the page without instantiating it.
 */
int
__wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
	WT_DECL_RET;
	WT_PAGE *parent;

	*skipp = false;

	/* If we have a clean page in memory, attempt to evict it. */
	if (ref->state == WT_REF_MEM &&
	    __wt_atomic_casv32(&ref->state, WT_REF_MEM, WT_REF_LOCKED)) {
		if (__wt_page_is_modified(ref->page)) {
			WT_PUBLISH(ref->state, WT_REF_MEM);
			return (0);
		}

		(void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1);
		ret = __wt_evict_page(session, ref);
		(void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1);
		WT_RET_BUSY_OK(ret);
	}

	/*
	 * Atomically switch the page's state to lock it.  If the page is not
	 * on-disk, other threads may be using it, no fast delete.
	 *
	 * Possible optimization: if the page is already deleted and the delete
	 * is visible to us (the delete has been committed), we could skip the
	 * page instead of instantiating it and figuring out there are no rows
	 * in the page.  While that's a huge amount of work to no purpose, it's
	 * unclear optimizing for overlapping range deletes is worth the effort.
	 */
	if (ref->state != WT_REF_DISK ||
	    !__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED))
		return (0);

	/*
	 * We cannot fast-delete pages that have overflow key/value items as
	 * the overflow blocks have to be discarded.  The way we figure that
	 * out is to check the on-page cell type for the page, cells for leaf
	 * pages that have no overflow items are special.
	 *
	 * In some cases, the reference address may not reference an on-page
	 * cell (for example, some combination of page splits), in which case
	 * we can't check the original cell value and we fail.
	 *
	 * To look at an on-page cell, we need to look at the parent page, and
	 * that's dangerous, our parent page could change without warning if
	 * the parent page were to split, deepening the tree.  It's safe: the
	 * page's reference will always point to some valid page, and if we find
	 * any problems we simply fail the fast-delete optimization.
	 *
	 * !!!
	 * I doubt it's worth the effort, but we could copy the cell's type into
	 * the reference structure, and then we wouldn't need an on-page cell.
	 */
	parent = ref->home;
	if (__wt_off_page(parent, ref->addr) ||
	    __wt_cell_type_raw(ref->addr) != WT_CELL_ADDR_LEAF_NO)
		goto err;

	/*
	 * This action dirties the parent page: mark it dirty now, there's no
	 * future reconciliation of the child leaf page that will dirty it as
	 * we write the tree.
	 */
	WT_ERR(__wt_page_parent_modify_set(session, ref, false));

	/*
	 * Record the change in the transaction structure and set the change's
	 * transaction ID.
	 */
	WT_ERR(__wt_calloc_one(session, &ref->page_del));
	ref->page_del->txnid = session->txn.id;

	WT_ERR(__wt_txn_modify_ref(session, ref));

	*skipp = true;
	WT_PUBLISH(ref->state, WT_REF_DELETED);
	return (0);

err:	__wt_free(session, ref->page_del);

	/*
	 * Restore the page to on-disk status, we'll have to instantiate it.
	 */
	WT_PUBLISH(ref->state, WT_REF_DISK);
	return (ret);
}
Exemple #5
0
/*
 * __page_read --
 *	Read a page from the file.
 */
static int
__page_read(WT_SESSION_IMPL *session, WT_REF *ref)
{
	const WT_PAGE_HEADER *dsk;
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_ITEM tmp;
	WT_PAGE *page;
	size_t addr_size;
	uint32_t previous_state;
	const uint8_t *addr;

	btree = S2BT(session);
	page = NULL;

	/*
	 * Don't pass an allocated buffer to the underlying block read function,
	 * force allocation of new memory of the appropriate size.
	 */
	WT_CLEAR(tmp);

	/*
	 * Attempt to set the state to WT_REF_READING for normal reads, or
	 * WT_REF_LOCKED, for deleted pages.  If successful, we've won the
	 * race, read the page.
	 */
	if (__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_READING))
		previous_state = WT_REF_DISK;
	else if (__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED))
		previous_state = WT_REF_DELETED;
	else
		return (0);

	/*
	 * Get the address: if there is no address, the page was deleted, but a
	 * subsequent search or insert is forcing re-creation of the name space.
	 */
	WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
	if (addr == NULL) {
		WT_ASSERT(session, previous_state == WT_REF_DELETED);

		WT_ERR(__wt_btree_new_leaf_page(session, &page));
		ref->page = page;
		goto done;
	}

	/*
	 * There's an address, read or map the backing disk page and build an
	 * in-memory version of the page.
	 */
	WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size));
	WT_ERR(__wt_page_inmem(session, ref, tmp.data, tmp.memsize,
	    WT_DATA_IN_ITEM(&tmp) ?
	    WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));

	/*
	 * Clear the local reference to an allocated copy of the disk image on
	 * return; the page steals it, errors in this code should not free it.
	 */
	tmp.mem = NULL;

	/*
	 * If reading for a checkpoint, there's no additional work to do, the
	 * page on disk is correct as written.
	 */
	if (session->dhandle->checkpoint != NULL)
		goto done;

	/* If the page was deleted, instantiate that information. */
	if (previous_state == WT_REF_DELETED)
		WT_ERR(__wt_delete_page_instantiate(session, ref));

	/*
	 * Instantiate updates from the database's lookaside table. The page
	 * flag was set when the page was written, potentially a long time ago.
	 * We only care if the lookaside table is currently active, check that
	 * before doing any work.
	 */
	dsk = tmp.data;
	if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE) && __wt_las_is_written(session)) {
		WT_STAT_FAST_CONN_INCR(session, cache_read_lookaside);
		WT_STAT_FAST_DATA_INCR(session, cache_read_lookaside);

		WT_ERR(__las_page_instantiate(
		    session, ref, btree->id, addr, addr_size));
	}

done:	WT_PUBLISH(ref->state, WT_REF_MEM);
	return (0);

err:	/*
	 * If the function building an in-memory version of the page failed,
	 * it discarded the page, but not the disk image.  Discard the page
	 * and separately discard the disk image in all cases.
	 */
	if (ref->page != NULL)
		__wt_ref_out(session, ref);
	WT_PUBLISH(ref->state, previous_state);

	__wt_buf_free(session, &tmp);

	return (ret);
}
Exemple #6
0
/*
 * __wt_delete_page --
 *	If deleting a range, try to delete the page without instantiating it.
 */
int
__wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
	WT_DECL_RET;
	WT_PAGE *parent;

	*skipp = false;

	/* If we have a clean page in memory, attempt to evict it. */
	if (ref->state == WT_REF_MEM &&
	    __wt_atomic_casv32(&ref->state, WT_REF_MEM, WT_REF_LOCKED)) {
		if (__wt_page_is_modified(ref->page)) {
			WT_PUBLISH(ref->state, WT_REF_MEM);
			return (0);
		}

		(void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1);
		ret = __wt_evict(session, ref, false);
		(void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1);
		WT_RET_BUSY_OK(ret);
	}

	/*
	 * Atomically switch the page's state to lock it.  If the page is not
	 * on-disk, other threads may be using it, no fast delete.
	 */
	if (ref->state != WT_REF_DISK ||
	    !__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED))
		return (0);

	/*
	 * We cannot fast-delete pages that have overflow key/value items as
	 * the overflow blocks have to be discarded.  The way we figure that
	 * out is to check the page's cell type, cells for leaf pages without
	 * overflow items are special.
	 *
	 * To look at an on-page cell, we need to look at the parent page, and
	 * that's dangerous, our parent page could change without warning if
	 * the parent page were to split, deepening the tree.  It's safe: the
	 * page's reference will always point to some valid page, and if we find
	 * any problems we simply fail the fast-delete optimization.
	 */
	parent = ref->home;
	if (__wt_off_page(parent, ref->addr) ?
	    ((WT_ADDR *)ref->addr)->type != WT_ADDR_LEAF_NO :
	    __wt_cell_type_raw(ref->addr) != WT_CELL_ADDR_LEAF_NO)
		goto err;

	/*
	 * This action dirties the parent page: mark it dirty now, there's no
	 * future reconciliation of the child leaf page that will dirty it as
	 * we write the tree.
	 */
	WT_ERR(__wt_page_parent_modify_set(session, ref, false));

	/*
	 * Record the change in the transaction structure and set the change's
	 * transaction ID.
	 */
	WT_ERR(__wt_calloc_one(session, &ref->page_del));
	ref->page_del->txnid = session->txn.id;

	WT_ERR(__wt_txn_modify_ref(session, ref));

	*skipp = true;
	WT_STAT_CONN_INCR(session, rec_page_delete_fast);
	WT_STAT_DATA_INCR(session, rec_page_delete_fast);
	WT_PUBLISH(ref->state, WT_REF_DELETED);
	return (0);

err:	__wt_free(session, ref->page_del);

	/*
	 * Restore the page to on-disk status, we'll have to instantiate it.
	 */
	WT_PUBLISH(ref->state, WT_REF_DISK);
	return (ret);
}
Exemple #7
0
/*
 * __wt_delete_page_rollback --
 *	Abort pages that were deleted without being instantiated.
 */
void
__wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_UPDATE **upd;
	uint64_t sleep_count, yield_count;

	/*
	 * If the page is still "deleted", it's as we left it, reset the state
	 * to on-disk and we're done.  Otherwise, we expect the page is either
	 * instantiated or being instantiated.  Loop because it's possible for
	 * the page to return to the deleted state if instantiation fails.
	 */
	for (sleep_count = yield_count = 0;;) {
		switch (ref->state) {
		case WT_REF_DISK:
		case WT_REF_LOOKASIDE:
		case WT_REF_READING:
			WT_ASSERT(session, 0);		/* Impossible, assert */
			break;
		case WT_REF_DELETED:
			/*
			 * If the page is still "deleted", it's as we left it,
			 * reset the state.
			 */
			if (__wt_atomic_casv32(
			    &ref->state, WT_REF_DELETED, WT_REF_DISK))
				return;
			break;
		case WT_REF_LOCKED:
			/*
			 * A possible state, the page is being instantiated.
			 */
			break;
		case WT_REF_MEM:
		case WT_REF_SPLIT:
			/*
			 * We can't use the normal read path to get a copy of
			 * the page because the session may have closed the
			 * cursor, we no longer have the reference to the tree
			 * required for a hazard pointer.  We're safe because
			 * with unresolved transactions, the page isn't going
			 * anywhere.
			 *
			 * The page is in an in-memory state, walk the list of
			 * update structures and abort them.
			 */
			for (upd =
			    ref->page_del->update_list; *upd != NULL; ++upd)
				(*upd)->txnid = WT_TXN_ABORTED;

			/*
			 * Discard the memory, the transaction can't abort
			 * twice.
			 */
			__wt_free(session, ref->page_del->update_list);
			__wt_free(session, ref->page_del);
			return;
		}
		/*
		 * We wait for the change in page state, yield before retrying,
		 * and if we've yielded enough times, start sleeping so we don't
		 * burn CPU to no purpose.
		 */
		__wt_ref_state_yield_sleep(&yield_count, &sleep_count);
		WT_STAT_CONN_INCRV(session, page_del_rollback_blocked,
		    sleep_count);
	}
}
Exemple #8
0
/*
 * __wt_compact_page_skip --
 *	Return if compaction requires we read this page.
 */
int
__wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
	WT_BM *bm;
	WT_DECL_RET;
	size_t addr_size;
	u_int type;
	const uint8_t *addr;

	/*
	 * Skip deleted pages, rewriting them doesn't seem useful; in a better
	 * world we'd write the parent to delete the page.
	 */
	if (ref->state == WT_REF_DELETED) {
		*skipp = true;
		return (0);
	}

	*skipp = false;				/* Default to reading */

	/*
	 * If the page is in-memory, we want to look at it (it may have been
	 * modified and written, and the current location is the interesting
	 * one in terms of compaction, not the original location).
	 *
	 * This test could be combined with the next one, but this is a cheap
	 * test and the next one is expensive.
	 */
	if (ref->state != WT_REF_DISK)
		return (0);

	/*
	 * There's nothing to prevent the WT_REF state from changing underfoot,
	 * which can change its address. For example, the WT_REF address might
	 * reference an on-page cell, and page eviction can free that memory.
	 * Lock the WT_REF so we can look at its address.
	 */
	if (!__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED))
		return (0);

	/*
	 * The page is on disk, so there had better be an address; assert that
	 * fact, test at run-time to avoid the core dump.
	 *
	 * Internal pages must be read to walk the tree; ask the block-manager
	 * if it's useful to rewrite leaf pages, don't do the I/O if a rewrite
	 * won't help.
	 */
	__wt_ref_info(ref, &addr, &addr_size, &type);
	WT_ASSERT(session, addr != NULL);
	if (addr != NULL && type != WT_CELL_ADDR_INT) {
		bm = S2BT(session)->bm;
		ret = bm->compact_page_skip(
		    bm, session, addr, addr_size, skipp);
	}

	/*
	 * Reset the WT_REF state and push the change. The full-barrier isn't
	 * necessary, but it's better to keep pages in circulation than not.
	 */
	ref->state = WT_REF_DISK;
	WT_FULL_BARRIER();

	return (ret);
}