Example #1
/*
 * __wt_async_op_enqueue --
 *	Enqueue an operation onto the work queue.
 */
int
__wt_async_op_enqueue(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op)
{
	WT_ASYNC *async;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	uint64_t cur_head, cur_tail, my_alloc, my_slot;
#ifdef	HAVE_DIAGNOSTIC
	WT_ASYNC_OP_IMPL *my_op;
#endif

	conn = S2C(session);
	async = conn->async;

	/*
	 * If an application re-uses a WT_ASYNC_OP, we end up here with an
	 * invalid object.
	 */
	if (op->state != WT_ASYNCOP_READY)
		WT_RET_MSG(session, EINVAL,
		    "application error: WT_ASYNC_OP already in use");

	/*
	 * Enqueue op at the tail of the work queue.
	 * We get our slot in the ring buffer to use.
	 */
	my_alloc = WT_ATOMIC_ADD8(async->alloc_head, 1);
	my_slot = my_alloc % async->async_qsize;

	/*
	 * Make sure we haven't wrapped around the queue.
	 * If so, wait for the tail to advance off this slot.
	 */
	WT_ORDERED_READ(cur_tail, async->tail_slot);
	while (cur_tail == my_slot) {
		__wt_yield();
		WT_ORDERED_READ(cur_tail, async->tail_slot);
	}

#ifdef	HAVE_DIAGNOSTIC
	WT_ORDERED_READ(my_op, async->async_queue[my_slot]);
	if (my_op != NULL)
		return (__wt_panic(session));
#endif
	WT_PUBLISH(async->async_queue[my_slot], op);
	op->state = WT_ASYNCOP_ENQUEUED;
	if (WT_ATOMIC_ADD4(async->cur_queue, 1) > async->max_queue)
		WT_PUBLISH(async->max_queue, async->cur_queue);
	/*
	 * Multiple threads may be adding ops to the queue.  We need to wait
	 * our turn to make our slot visible to workers.
	 */
	WT_ORDERED_READ(cur_head, async->head);
	while (cur_head != (my_alloc - 1)) {
		__wt_yield();
		WT_ORDERED_READ(cur_head, async->head);
	}
	WT_PUBLISH(async->head, my_alloc);
	return (ret);
}
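
The enqueue above hinges on the pairing of WT_ORDERED_READ and WT_PUBLISH: the producer fills in the operation and publishes the slot last, so a consumer that sees the published value also sees every field written before it. As a rough mental model only (an assumption about the macros, not their actual definitions), they behave like a C11 release store matched with an acquire load. The stand-alone sketch below uses standard atomics rather than the WiredTiger macros.

/*
 * Sketch only: a single published slot, with C11 atomics standing in for
 * WT_PUBLISH (store-release) and WT_ORDERED_READ (load-acquire).
 */
#include <stdatomic.h>

struct op {
	int payload;				/* filled in before publishing */
};

static struct op *_Atomic queue_slot;		/* one ring-buffer slot */

/* Producer: complete the operation, then make the pointer visible. */
static void
slot_publish(struct op *op, int payload)
{
	op->payload = payload;
	atomic_store_explicit(&queue_slot, op, memory_order_release);
}

/* Consumer: if the acquire load returns non-NULL, payload is visible too. */
static struct op *
slot_take(void)
{
	return (atomic_load_explicit(&queue_slot, memory_order_acquire));
}
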
Example #2
/*
 * __rec_page_dirty_update --
 *	Update a dirty page's reference on eviction.
 */
static int
__rec_page_dirty_update(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_ADDR *addr;
	WT_PAGE_MODIFY *mod;
	WT_REF *parent_ref;

	mod = page->modify;
	parent_ref = page->ref;

	switch (F_ISSET(mod, WT_PM_REC_MASK)) {
	case WT_PM_REC_REPLACE: 			/* 1-for-1 page swap */
		if (parent_ref->addr != NULL &&
		    __wt_off_page(page->parent, parent_ref->addr)) {
			__wt_free(session, ((WT_ADDR *)parent_ref->addr)->addr);
			__wt_free(session, parent_ref->addr);
		}

		/*
		 * Update the parent to reference the replacement page.
		 *
		 * Publish: a barrier to ensure the structure fields are set
		 * before the state change makes the page available to readers.
		 */
		WT_RET(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));
		*addr = mod->u.replace;
		mod->u.replace.addr = NULL;
		mod->u.replace.size = 0;

		parent_ref->page = NULL;
		parent_ref->addr = addr;
		WT_PUBLISH(parent_ref->state, WT_REF_DISK);
		break;
	case WT_PM_REC_SPLIT:				/* Page split */
		/*
		 * Update the parent to reference new internal page(s).
		 *
		 * Publish: a barrier to ensure the structure fields are set
		 * before the state change makes the page available to readers.
		 */
		parent_ref->page = mod->u.split;
		WT_PUBLISH(parent_ref->state, WT_REF_MEM);

		/* Clear the reference else discarding the page will free it. */
		mod->u.split = NULL;
		F_CLR(mod, WT_PM_REC_SPLIT);
		break;
	case WT_PM_REC_EMPTY:				/* Page is empty */
		/* We checked if the page was empty when we reviewed it. */
		/* FALLTHROUGH */
	WT_ILLEGAL_VALUE(session);
	}

	return (0);
}
Example #3
/*
 * __spin_lock_next_id --
 *	Return the next spinlock caller ID.
 */
static int
__spin_lock_next_id(WT_SESSION_IMPL *session, int *idp)
{
	static int lock_id = 0, next_id = 0;
	WT_DECL_RET;

	/* If we've ever registered this location, we already have an ID. */
	if (*idp != WT_SPINLOCK_REGISTER)
		return (0);

	/*
	 * We can't use the global spinlock to lock the ID allocation (duh!),
	 * so use a CAS instruction to serialize access to a local variable.
	 * This work only gets done once per library instantiation, so there
	 * isn't a performance concern.
	 */
	while (!WT_ATOMIC_CAS(lock_id, 0, 1))
		__wt_yield();

	/* Allocate a blocking ID for this location. */
	if (*idp == WT_SPINLOCK_REGISTER) {
		if (next_id < WT_SPINLOCK_MAX_LOCATION_ID)
			*idp = next_id++;
		else
			WT_ERR_MSG(session, ENOMEM,
			    "spinlock caller location registry failed, "
			    "increase the connection's blocking matrix size");
	}

err:	WT_PUBLISH(lock_id, 0);
	return (ret);
}
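
The same idea in isolation, assuming C11 atomics in place of WT_ATOMIC_CAS and WT_PUBLISH: a one-word CAS lock serializes the one-time registration, the ID is re-checked under the lock, and the lock word is cleared with a release store so the update is visible before another thread can enter. The names below are illustrative, not WiredTiger's.

#include <sched.h>
#include <stdatomic.h>

#define MAX_LOCATION_ID		64
#define ID_UNREGISTERED		(-1)

static atomic_int registry_lock;	/* 0 = free, 1 = held */
static int next_location_id;		/* protected by registry_lock */

static int
location_id_register(int *idp)
{
	int expected;

	if (*idp != ID_UNREGISTERED)		/* already registered */
		return (0);

	/* Spin until the lock word swaps from 0 to 1. */
	for (;;) {
		expected = 0;
		if (atomic_compare_exchange_weak(&registry_lock, &expected, 1))
			break;
		sched_yield();
	}

	/* Re-check under the lock: another thread may have beaten us. */
	if (*idp == ID_UNREGISTERED && next_location_id < MAX_LOCATION_ID)
		*idp = next_location_id++;

	/* Release: make the update visible before dropping the lock. */
	atomic_store_explicit(&registry_lock, 0, memory_order_release);

	return (*idp == ID_UNREGISTERED ? -1 : 0);
}
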
Example #4
/*
 * __wt_delete_page_skip --
 *	If iterating a cursor, skip deleted pages that are visible to us.
 */
bool
__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
{
	bool skip;

	/*
	 * Deleted pages come from two sources: either it's a fast-delete as
	 * described above, or the page has been emptied by other operations
	 * and eviction deleted it.
	 *
	 * In both cases, the WT_REF state will be WT_REF_DELETED.  In the case
	 * of a fast-delete page, there will be a WT_PAGE_DELETED structure with
	 * the transaction ID of the transaction that deleted the page, and the
	 * page is visible if that transaction ID is visible.  In the case of an
	 * empty page, there will be no WT_PAGE_DELETED structure and the delete
	 * is by definition visible, eviction could not have deleted the page if
	 * there were changes on it that were not globally visible.
	 *
	 * We're here because we found a WT_REF state set to WT_REF_DELETED.  It
	 * is possible the page is being read into memory right now, though, and
	 * the page could switch to an in-memory state at any time.  Lock down
	 * the structure, just to be safe.
	 */
	if (ref->page_del == NULL)
		return (true);

	if (!__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED))
		return (false);

	skip = (ref->page_del == NULL ||
	    __wt_txn_visible(session, ref->page_del->txnid));

	WT_PUBLISH(ref->state, WT_REF_DELETED);
	return (skip);
}
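
The function above is a lock-check-unlock dance on the WT_REF state word. The sketch below restates it with hypothetical types and C11 atomics; txn_visible() is an assumed stand-in for __wt_txn_visible(). Swing the state from DELETED to LOCKED so no reader can instantiate the page, examine the structure, then restore DELETED with a release store.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

enum ref_state { REF_DISK, REF_DELETED, REF_LOCKED, REF_MEM };

struct page_del {
	uint64_t txnid;			/* deleting transaction */
};

struct ref {
	_Atomic uint32_t state;
	struct page_del *page_del;	/* NULL once the delete is globally visible */
};

/* Hypothetical visibility check standing in for __wt_txn_visible(). */
extern bool txn_visible(uint64_t txnid);

static bool
deleted_page_skip(struct ref *ref)
{
	uint32_t expected;
	bool skip;

	if (ref->page_del == NULL)	/* delete is visible by definition */
		return (true);

	/* Lock the reference so the page cannot change state underneath us. */
	expected = REF_DELETED;
	if (!atomic_compare_exchange_strong(&ref->state, &expected, REF_LOCKED))
		return (false);

	skip = ref->page_del == NULL || txn_visible(ref->page_del->txnid);

	/* Publish: restore the state only after we're done with the structure. */
	atomic_store_explicit(&ref->state, REF_DELETED, memory_order_release);
	return (skip);
}
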
Example #5
/*
 * __wt_txn_release --
 *	Release the resources associated with the current transaction.
 */
void
__wt_txn_release(WT_SESSION_IMPL *session)
{
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_STATE *txn_state;

	txn = &session->txn;
	txn->mod_count = 0;
	txn->notify = NULL;

	txn_global = &S2C(session)->txn_global;
	txn_state = &txn_global->states[session->id];

	/* Clear the transaction's ID from the global table. */
	WT_ASSERT(session, txn_state->id != WT_TXN_NONE &&
	    txn->id != WT_TXN_NONE);
	WT_PUBLISH(txn_state->id, WT_TXN_NONE);
	txn->id = WT_TXN_NONE;

	/*
	 * Reset the transaction state to not running.
	 *
	 * Auto-commit transactions (identified by having active cursors)
	 * handle this at a higher level.
	 */
	if (session->ncursors == 0)
		__wt_txn_release_snapshot(session);
	txn->isolation = session->isolation;
	F_CLR(txn, TXN_ERROR | TXN_OLDEST | TXN_RUNNING);
}
Example #6
/*
 * __wt_log_slot_activate --
 *	Initialize a slot to become active.
 */
void
__wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
	WT_CONNECTION_IMPL *conn;
	WT_LOG *log;

	conn = S2C(session);
	log = conn->log;

	/*
	 * !!! slot_release_lsn must be set outside this function because
	 * this function may be called after a log file switch and the
	 * slot_release_lsn must refer to the end of the previous log.
	 * !!! We cannot initialize flags here because it may already be
	 * set for closing the file handle on a log file switch.  The flags
	 * are reset when the slot is freed.  See log_slot_free.
	 */
	slot->slot_unbuffered = 0;
	slot->slot_start_lsn = slot->slot_end_lsn = log->alloc_lsn;
	slot->slot_start_offset = log->alloc_lsn.l.offset;
	slot->slot_last_offset = log->alloc_lsn.l.offset;
	slot->slot_fh = log->log_fh;
	slot->slot_error = 0;
	WT_DIAGNOSTIC_YIELD;
	/*
	 * Set the slot state last.  Other threads may have a stale pointer
	 * to this slot and could try to alter the state and other fields once
	 * they see the state cleared.
	 */
	WT_PUBLISH(slot->slot_state, 0);
}
Example #7
/*
 * __wt_posix_file_fallocate --
 *	POSIX fallocate.
 */
int
__wt_posix_file_fallocate(WT_FILE_HANDLE *file_handle,
    WT_SESSION *wt_session, wt_off_t offset, wt_off_t len)
{
	/*
	 * The first fallocate call: figure out what fallocate call this system
	 * supports, if any.
	 *
	 * The function is configured as a locking fallocate call, so we know
	 * we're single-threaded through here. Set the nolock function first,
	 * then publish the NULL replacement to ensure the handle functions are
	 * always correct.
	 *
	 * We've seen Linux systems where posix_fallocate has corrupted
	 * existing file data (even though that is explicitly disallowed
	 * by POSIX). FreeBSD and Solaris support posix_fallocate, and
	 * so far we've seen no problems leaving it unlocked. Check for
	 * fallocate (and the system call version of fallocate) first to
	 * avoid locking on Linux if at all possible.
	 */
	if (__posix_std_fallocate(file_handle, wt_session, offset, len) == 0) {
		file_handle->fh_allocate_nolock = __posix_std_fallocate;
		WT_PUBLISH(file_handle->fh_allocate, NULL);
		return (0);
	}
	if (__posix_sys_fallocate(file_handle, wt_session, offset, len) == 0) {
		file_handle->fh_allocate_nolock = __posix_sys_fallocate;
		WT_PUBLISH(file_handle->fh_allocate, NULL);
		return (0);
	}
	if (__posix_posix_fallocate(
	    file_handle, wt_session, offset, len) == 0) {
#if defined(__linux__)
		file_handle->fh_allocate = __posix_posix_fallocate;
		WT_WRITE_BARRIER();
#else
		file_handle->fh_allocate_nolock = __posix_posix_fallocate;
		WT_PUBLISH(file_handle->fh_allocate, NULL);
#endif
		return (0);
	}

	file_handle->fh_allocate = NULL;
	WT_WRITE_BARRIER();
	return (ENOTSUP);
}
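
The hand-off above switches the handle from the locking to the lock-free allocation path while still single-threaded under the locking entry point. A condensed sketch, with a hypothetical handle layout and C11 atomics standing in for WT_PUBLISH and WT_WRITE_BARRIER: install the replacement pointer first, then publish NULL over the locking slot, so a concurrent caller always finds exactly one usable function.

#include <stdatomic.h>
#include <stddef.h>

struct file_handle;
typedef int (*alloc_fn)(struct file_handle *, long, long);

struct file_handle {
	alloc_fn allocate_nolock;	/* lock-free path */
	_Atomic alloc_fn allocate;	/* locking path; NULL disables it */
};

/* Hypothetical stand-in for whichever fallocate variant the probe chose. */
extern int probed_fallocate(struct file_handle *, long, long);

static int
first_fallocate(struct file_handle *fh, long offset, long len)
{
	if (probed_fallocate(fh, offset, len) != 0)
		return (-1);		/* ENOTSUP in the real code */

	/* Set the lock-free replacement before retiring the locking slot. */
	fh->allocate_nolock = probed_fallocate;
	atomic_store_explicit(&fh->allocate, (alloc_fn)NULL,
	    memory_order_release);
	return (0);
}
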
Example #8
/*
 * __rec_page_clean_update  --
 *	Update a clean page's reference on eviction.
 */
static void
__rec_page_clean_update(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	/* Update the relevant WT_REF structure. */
	page->ref->page = NULL;
	WT_PUBLISH(page->ref->state, WT_REF_DISK);

	WT_UNUSED(session);
}
Example #9
/*
 * __wt_txn_release --
 *	Release the resources associated with the current transaction.
 */
void
__wt_txn_release(WT_SESSION_IMPL *session)
{
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_STATE *txn_state;

	txn = &session->txn;
	txn_global = &S2C(session)->txn_global;
	txn_state = WT_SESSION_TXN_STATE(session);

	WT_ASSERT(session, txn->mod_count == 0);
	txn->notify = NULL;

	/* Clear the transaction's ID from the global table. */
	if (WT_SESSION_IS_CHECKPOINT(session)) {
		WT_ASSERT(session, txn_state->id == WT_TXN_NONE);
		txn->id = txn_global->checkpoint_state.id =
		    txn_global->checkpoint_state.pinned_id = WT_TXN_NONE;

		/*
		 * Be extra careful to cleanup everything for checkpoints: once
		 * the global checkpoint ID is cleared, we can no longer tell
		 * if this session is doing a checkpoint.
		 */
		txn_global->checkpoint_id = 0;
	} else if (F_ISSET(txn, WT_TXN_HAS_ID)) {
		WT_ASSERT(session,
		    !WT_TXNID_LT(txn->id, txn_global->last_running));

		WT_ASSERT(session, txn_state->id != WT_TXN_NONE &&
		    txn->id != WT_TXN_NONE);
		WT_PUBLISH(txn_state->id, WT_TXN_NONE);

		txn->id = WT_TXN_NONE;
	}

	__wt_txn_clear_commit_timestamp(session);
	__wt_txn_clear_read_timestamp(session);

	/* Free the scratch buffer allocated for logging. */
	__wt_logrec_free(session, &txn->logrec);

	/* Discard any memory from the session's stash that we can. */
	WT_ASSERT(session, __wt_session_gen(session, WT_GEN_SPLIT) == 0);
	__wt_stash_discard(session);

	/*
	 * Reset the transaction state to not running and release the snapshot.
	 */
	__wt_txn_release_snapshot(session);
	txn->isolation = session->isolation;

	/* Ensure the transaction flags are cleared on exit */
	txn->flags = 0;
}
Example #10
/*
 * __wt_bt_cache_op --
 *	Cache operations: compaction, discard, sync/checkpoint.
 */
int
__wt_bt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, int op)
{
	WT_DECL_RET;
	WT_BTREE *btree;

	btree = session->btree;

	/*
	 * Compaction and sync/checkpoint reconcile dirty pages from the cache
	 * to the backing block manager.  Reconciliation is just another reader
	 * of the page, so with some care, it can be done in the current thread,
	 * leaving the eviction thread to keep freeing spaces if the cache is
	 * full.  Sync and eviction cannot operate on the same page at the same
	 * time, and there are different modes inside __wt_tree_walk to make
	 * sure they don't trip over each other.
	 *
	 * The current thread cannot evict pages from the cache, so discard is
	 * done by calling the eviction server for service.
	 *
	 * XXX
	 * Set the checkpoint reference for reconciliation -- this is ugly, but
	 * there's no data structure path from here to reconciliation.
	 *
	 * Publish: there must be a barrier to ensure the structure fields are
	 * set before the eviction thread can see the request.
	 */
	WT_PUBLISH(btree->ckpt, ckptbase);

	switch (op) {
	case WT_SYNC_CHECKPOINT:
	case WT_SYNC_COMPACT:
	case WT_SYNC_WRITE_LEAVES:
		WT_ERR(__wt_sync_file(session, op));
		break;
	case WT_SYNC_DISCARD:
	case WT_SYNC_DISCARD_NOWRITE:
		/*
		 * Schedule and wake the eviction server, then wait for the
		 * eviction server to wake us.
		 */
		WT_ERR(__wt_sync_file_serial(session, op));
		WT_ERR(__wt_evict_server_wake(session));
		WT_ERR(__wt_cond_wait(session, session->cond, 0));
		ret = session->syncop_ret;

		/* If discarding the tree, the root page should be gone. */
		WT_ASSERT(session, ret != 0 || btree->root_page == NULL);
		break;
	WT_ILLEGAL_VALUE_ERR(session);
	}

err:	btree->ckpt = NULL;
	return (ret);
}
Example #11
/*
 * __wt_delete_page_skip --
 *	If iterating a cursor, skip deleted pages that are either visible to
 * us or globally visible.
 */
bool
__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
{
	bool skip;

	/*
	 * Deleted pages come from two sources: either it's a fast-delete as
	 * described above, or the page has been emptied by other operations
	 * and eviction deleted it.
	 *
	 * In both cases, the WT_REF state will be WT_REF_DELETED.  In the case
	 * of a fast-delete page, there will be a WT_PAGE_DELETED structure with
	 * the transaction ID of the transaction that deleted the page, and the
	 * page is visible if that transaction ID is visible.  In the case of an
	 * empty page, there will be no WT_PAGE_DELETED structure and the delete
	 * is by definition visible, eviction could not have deleted the page if
	 * there were changes on it that were not globally visible.
	 *
	 * We're here because we found a WT_REF state set to WT_REF_DELETED.  It
	 * is possible the page is being read into memory right now, though, and
	 * the page could switch to an in-memory state at any time.  Lock down
	 * the structure, just to be safe.
	 */
	if (ref->page_del == NULL)
		return (true);

	if (!__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED))
		return (false);

	skip = ref->page_del == NULL || (visible_all ?
	    __wt_txn_visible_all(session, ref->page_del->txnid,
		WT_TIMESTAMP_NULL(&ref->page_del->timestamp)):
	    __wt_txn_visible(session, ref->page_del->txnid,
		WT_TIMESTAMP_NULL(&ref->page_del->timestamp)));

	/*
	 * The page_del structure can be freed as soon as the delete is stable:
	 * it is only read when the ref state is WT_REF_DELETED.  It is worth
	 * checking every time we come through because once this is freed, we
	 * no longer need synchronization to check the ref.
	 */
	if (skip && ref->page_del != NULL && (visible_all ||
	    __wt_txn_visible_all(session, ref->page_del->txnid,
		WT_TIMESTAMP_NULL(&ref->page_del->timestamp)))) {
		__wt_free(session, ref->page_del->update_list);
		__wt_free(session, ref->page_del);
	}

	WT_PUBLISH(ref->state, WT_REF_DELETED);
	return (skip);
}
Example #12
/*
 * __merge_unlock --
 *	Unlock all pages under an internal page being merged.
 */
static void
__merge_unlock(WT_PAGE *page)
{
	WT_REF *ref;
	uint32_t i;

	WT_REF_FOREACH(page, ref, i)
		if (ref->state == WT_REF_LOCKED) {
			if (ref->page->type == WT_PAGE_ROW_INT ||
			    ref->page->type == WT_PAGE_COL_INT)
				__merge_unlock(ref->page);
			WT_PUBLISH(ref->state, WT_REF_MEM);
		}
}
Example #13
/*
 * __rec_page_clean_update --
 *	Update a clean page's reference on eviction.
 */
static void
__rec_page_clean_update(WT_SESSION_IMPL *session, WT_REF *parent_ref)
{
	/*
	 * Update the WT_REF structure in the parent.  If the page has an
	 * address, it's a disk page; if it has no address, it must be a
	 * deleted page that was re-instantiated (for example, by searching)
	 * and never written.
	 */
	parent_ref->page = NULL;
	WT_PUBLISH(parent_ref->state,
	    parent_ref->addr == NULL ? WT_REF_DELETED : WT_REF_DISK);

	WT_UNUSED(session);
}
Example #14
/*
 * __wt_txn_release --
 *	Release the resources associated with the current transaction.
 */
void
__wt_txn_release(WT_SESSION_IMPL *session)
{
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_STATE *txn_state;

	txn = &session->txn;
	WT_ASSERT(session, txn->mod_count == 0);
	txn->notify = NULL;

	txn_global = &S2C(session)->txn_global;
	txn_state = WT_SESSION_TXN_STATE(session);

	/* Clear the transaction's ID from the global table. */
	if (WT_SESSION_IS_CHECKPOINT(session)) {
		WT_ASSERT(session, txn_state->id == WT_TXN_NONE);
		txn->id = WT_TXN_NONE;

		/* Clear the global checkpoint transaction IDs. */
		txn_global->checkpoint_id = 0;
		txn_global->checkpoint_pinned = WT_TXN_NONE;
	} else if (F_ISSET(txn, WT_TXN_HAS_ID)) {
		WT_ASSERT(session,
		    !WT_TXNID_LT(txn->id, txn_global->last_running));

		WT_ASSERT(session, txn_state->id != WT_TXN_NONE &&
		    txn->id != WT_TXN_NONE);
		WT_PUBLISH(txn_state->id, WT_TXN_NONE);
		txn->id = WT_TXN_NONE;
	}

	/* Free the scratch buffer allocated for logging. */
	__wt_logrec_free(session, &txn->logrec);

	/* Discard any memory from the session's split stash that we can. */
	WT_ASSERT(session, session->split_gen == 0);
	if (session->split_stash_cnt > 0)
		__wt_split_stash_discard(session);

	/*
	 * Reset the transaction state to not running and release the snapshot.
	 */
	__wt_txn_release_snapshot(session);
	txn->isolation = session->isolation;
	/* Ensure the transaction flags are cleared on exit */
	txn->flags = 0;
}
Example #15
/*
 * __wt_hazard_clear --
 *	Clear a hazard pointer.
 */
int
__wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_BTREE *btree;
	WT_HAZARD *hp;

	btree = S2BT(session);

	/* If a file can never be evicted, hazard pointers aren't required. */
	if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
		return (0);

	/*
	 * Clear the caller's hazard pointer.
	 * The common pattern is LIFO, so do a reverse search.
	 */
	for (hp = session->hazard + session->hazard_size - 1;
	    hp >= session->hazard;
	    --hp)
		if (hp->page == page) {
			/*
			 * We don't publish the hazard pointer clear in the
			 * general case.  It's not required for correctness;
			 * it would give an eviction thread faster access to
			 * the page if the page were selected for eviction,
			 * but the generation number was just set and it's
			 * unlikely the page will be selected for eviction.
			 */
			hp->page = NULL;

			/*
			 * If this was the last hazard pointer in the session,
			 * reset the size so that checks can skip this session.
			 */
			if (--session->nhazard == 0)
				WT_PUBLISH(session->hazard_size, 0);
			return (0);
		}

	/*
	 * A serious error, we should always find the hazard pointer.  Panic,
	 * because using a page we didn't have pinned down implies corruption.
	 */
	WT_PANIC_RET(session, EINVAL,
	    "session %p: clear hazard pointer: %p: not found",
	    (void *)session, (void *)page);
}
Example #16
/*
 * __rec_page_clean_update  --
 *	Update a clean page's reference on eviction.
 */
static void
__rec_page_clean_update(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_REF *ref;

	ref = page->ref;

	/*
	 * Update the page's WT_REF structure.  If the page has an address, it's
	 * a disk page; if it has no address, it must be a deleted page that was
	 * re-instantiated (for example, by searching) and never written.
	 */
	ref->page = NULL;
	WT_PUBLISH(ref->state,
	    ref->addr == NULL ? WT_REF_DELETED : WT_REF_DISK);

	WT_UNUSED(session);
}
Example #17
/*
 * __wt_update_serial_func --
 *	Server function to add a WT_UPDATE entry to the page array.
 */
int
__wt_update_serial_func(WT_SESSION_IMPL *session, void *args)
{
	WT_PAGE *page;
	WT_UPDATE **new_upd, *upd, **upd_entry, **upd_obsolete;
	uint32_t write_gen;

	__wt_update_unpack(
	    args, &page, &write_gen, &upd_entry, &new_upd, &upd, &upd_obsolete);

	/* Check the page's write-generation. */
	WT_RET(__wt_page_write_gen_check(session, page, write_gen));

	upd->next = *upd_entry;
	/*
	 * Publish: there must be a barrier to ensure the new entry's next
	 * pointer is set before we update the linked list.
	 */
	WT_PUBLISH(*upd_entry, upd);
	__wt_update_upd_taken(session, args, page);

	/*
	 * If the page needs an update array (column-store pages and inserts on
	 * row-store pages do not use the update array), our caller passed us
	 * one of the correct size.   Check the page still needs one (the write
	 * generation test should have caught that, though).
	 *
	 * NOTE: it is important to do this after publishing that the update is
	 * set.  Code can assume that if the array is set, it is non-empty.
	 */
	if (new_upd != NULL && page->u.row.upd == NULL) {
		page->u.row.upd = new_upd;
		__wt_update_new_upd_taken(session, args, page);
	}

	/* Discard obsolete WT_UPDATE structures. */
	*upd_obsolete = __wt_update_obsolete_check(session, upd->next);

	__wt_page_and_tree_modify_set(session, page);
	return (0);
}
Example #18
/*
 * __wt_insert_serial_func --
 *	Server function to add a WT_INSERT entry to the page.
 */
int
__wt_insert_serial_func(WT_SESSION_IMPL *session, void *args)
{
	WT_INSERT *new_ins, ***ins_stack;
	WT_INSERT_HEAD *inshead, **insheadp, **new_inslist, *new_inshead;
	WT_PAGE *page;
	uint32_t write_gen;
	u_int i, skipdepth;

	__wt_insert_unpack(args, &page, &write_gen, &insheadp,
	    &ins_stack, &new_inslist, &new_inshead, &new_ins, &skipdepth);

	/* Check the page's write-generation. */
	WT_RET(__wt_page_write_gen_check(session, page, write_gen));

	/*
	 * Publish: First, point the new WT_INSERT item's skiplist references
	 * to the next elements in the insert list, then flush memory.  Second,
	 * update the skiplist elements that reference the new WT_INSERT item,
	 * this ensures the list is never inconsistent.
	 */
	if ((inshead = *insheadp) == NULL)
		inshead = new_inshead;
	for (i = 0; i < skipdepth; i++)
		new_ins->next[i] = *ins_stack[i];
	WT_WRITE_BARRIER();
	for (i = 0; i < skipdepth; i++) {
		if (inshead->tail[i] == NULL ||
		    ins_stack[i] == &inshead->tail[i]->next[i])
			inshead->tail[i] = new_ins;
		*ins_stack[i] = new_ins;
	}

	__wt_insert_new_ins_taken(session, args, page);

	/*
	 * If the insert head does not yet have an insert list, our caller
	 * passed us one.
	 *
	 * NOTE: it is important to do this after the item has been added to
	 * the list.  Code can assume that if the list is set, it is non-empty.
	 */
	if (*insheadp == NULL) {
		WT_PUBLISH(*insheadp, new_inshead);
		__wt_insert_new_inshead_taken(session, args, page);
	}

	/*
	 * If the page does not yet have an insert array, our caller passed
	 * us one.
	 *
	 * NOTE: it is important to do this after publishing the list entry.
	 * Code can assume that if the array is set, it is non-empty.
	 */
	if (page->type == WT_PAGE_ROW_LEAF) {
		if (page->u.row.ins == NULL) {
			page->u.row.ins = new_inslist;
			__wt_insert_new_inslist_taken(session, args, page);
		}
	} else
		if (page->modify->update == NULL) {
			page->modify->update = new_inslist;
			__wt_insert_new_inslist_taken(session, args, page);
		}
	__wt_page_and_tree_modify_set(session, page);
	return (0);
}
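
The splice in the middle of the function above is the heart of the lock-free skiplist insert: wire up the new node's forward pointers, flush memory, and only then make the node reachable, so a concurrent reader never follows an uninitialized pointer. A reduced sketch with a hypothetical node layout, using a C11 release fence where the original uses WT_WRITE_BARRIER:

#include <stdatomic.h>

#define SKIP_MAXDEPTH	10

struct skip_node {
	int key;
	struct skip_node *next[SKIP_MAXDEPTH];
};

/*
 * ins_stack[i] points at the predecessor's next[i] slot where the new node
 * must be linked, as located by an earlier search (not shown).
 */
static void
skiplist_splice(struct skip_node *new_node,
    struct skip_node **ins_stack[], unsigned depth)
{
	unsigned i;

	/* Phase 1: the new node points at its successors. */
	for (i = 0; i < depth; i++)
		new_node->next[i] = *ins_stack[i];

	/* Fence: readers must not see the node before its pointers are set. */
	atomic_thread_fence(memory_order_release);

	/* Phase 2: make the node reachable at each level. */
	for (i = 0; i < depth; i++)
		*ins_stack[i] = new_node;
}
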
Example #19
/*
 * __wt_lsm_checkpoint_chunk --
 *	Flush a single LSM chunk to disk.
 */
int
__wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
{
	WT_DECL_RET;
	WT_TXN_ISOLATION saved_isolation;
	bool flush_set;

	flush_set = false;

	/*
	 * If the chunk is already checkpointed, make sure it is also evicted.
	 * Either way, there is no point trying to checkpoint it again.
	 */
	if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
	    !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) &&
	    !chunk->evicted) {
		WT_WITH_HANDLE_LIST_LOCK(session,
		    ret = __lsm_discard_handle(session, chunk->uri, NULL));
		if (ret == 0)
			chunk->evicted = 1;
		else if (ret == EBUSY)
			ret = 0;
		else
			WT_RET_MSG(session, ret, "discard handle");
	}
	if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
		WT_RET(__wt_verbose(session, WT_VERB_LSM,
		    "LSM worker %s already on disk",
		    chunk->uri));
		return (0);
	}

	/* Stop if a running transaction needs the chunk. */
	__wt_txn_update_oldest(session, true);
	if (chunk->switch_txn == WT_TXN_NONE ||
	    !__wt_txn_visible_all(session, chunk->switch_txn)) {
		WT_RET(__wt_verbose(session, WT_VERB_LSM,
		    "LSM worker %s: running transaction, return",
		    chunk->uri));
		return (0);
	}

	if (!__wt_atomic_cas8(&chunk->flushing, 0, 1))
		return (0);
	flush_set = true;

	WT_ERR(__wt_verbose(session, WT_VERB_LSM, "LSM worker flushing %s",
	    chunk->uri));

	/*
	 * Flush the file before checkpointing: this is the expensive part in
	 * terms of I/O.
	 *
	 * !!!
	 * We can wait here for checkpoints and fsyncs to complete, which can
	 * take a long time.
	 */
	if ((ret = __wt_session_get_btree(
	    session, chunk->uri, NULL, NULL, 0)) == 0) {
		/*
		 * Set read-uncommitted: we have already checked that all of the
		 * updates in this chunk are globally visible, use the cheapest
		 * possible check in reconciliation.
		 */
		saved_isolation = session->txn.isolation;
		session->txn.isolation = WT_ISO_READ_UNCOMMITTED;
		ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES);
		session->txn.isolation = saved_isolation;
		WT_TRET(__wt_session_release_btree(session));
	}
	WT_ERR(ret);

	WT_ERR(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing %s",
	    chunk->uri));

	/*
	 * Turn on metadata tracking to ensure the checkpoint gets the
	 * necessary handle locks.
	 *
	 * Ensure that we don't race with a running checkpoint: the checkpoint
	 * lock protects against us racing with an application checkpoint in
	 * this chunk.  Don't wait for it, though: checkpoints can take a long
	 * time, and our checkpoint operation should be very quick.
	 */
	WT_ERR(__wt_meta_track_on(session));
	WT_WITH_CHECKPOINT_LOCK(session, ret,
	    WT_WITH_SCHEMA_LOCK(session, ret,
		ret = __wt_schema_worker(
		session, chunk->uri, __wt_checkpoint, NULL, NULL, 0)));
	WT_TRET(__wt_meta_track_off(session, false, ret != 0));
	if (ret != 0)
		WT_ERR_MSG(session, ret, "LSM checkpoint");

	/* Now the file is written, get the chunk size. */
	WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));

	/* Update the flush timestamp to help track ongoing progress. */
	WT_ERR(__wt_epoch(session, &lsm_tree->last_flush_ts));
	++lsm_tree->chunks_flushed;

	/* Lock the tree, mark the chunk as on disk and update the metadata. */
	WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
	F_SET(chunk, WT_LSM_CHUNK_ONDISK);
	ret = __wt_lsm_meta_write(session, lsm_tree);
	++lsm_tree->dsk_gen;

	/* Update the throttle time. */
	__wt_lsm_tree_throttle(session, lsm_tree, true);
	WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
	if (ret != 0)
		WT_ERR_MSG(session, ret, "LSM metadata write");

	WT_PUBLISH(chunk->flushing, 0);
	flush_set = false;

	/*
	 * Clear the no-eviction flag so the primary can be evicted and
	 * eventually closed.  Only do this once the checkpoint has succeeded:
	 * otherwise, accessing the leaf page during the checkpoint can trigger
	 * forced eviction.
	 */
	WT_ERR(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0));
	__wt_btree_evictable(session, true);
	WT_ERR(__wt_session_release_btree(session));

	/* Make sure we aren't pinning a transaction ID. */
	__wt_txn_release_snapshot(session);

	WT_ERR(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointed %s",
	    chunk->uri));

	/* Schedule a bloom filter create for our newly flushed chunk. */
	if (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF))
		WT_ERR(__wt_lsm_manager_push_entry(
		    session, WT_LSM_WORK_BLOOM, 0, lsm_tree));
	else
		WT_ERR(__wt_lsm_manager_push_entry(
		    session, WT_LSM_WORK_MERGE, 0, lsm_tree));

err:	if (flush_set)
		WT_PUBLISH(chunk->flushing, 0);

	return (ret);
}
Example #20
/*
 * __wt_delete_page --
 *	If deleting a range, try to delete the page without instantiating it.
 */
int
__wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
	WT_DECL_RET;
	WT_PAGE *parent;

	*skipp = false;

	/* If we have a clean page in memory, attempt to evict it. */
	if (ref->state == WT_REF_MEM &&
	    __wt_atomic_casv32(&ref->state, WT_REF_MEM, WT_REF_LOCKED)) {
		if (__wt_page_is_modified(ref->page)) {
			WT_PUBLISH(ref->state, WT_REF_MEM);
			return (0);
		}

		(void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1);
		ret = __wt_evict_page(session, ref);
		(void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1);
		WT_RET_BUSY_OK(ret);
	}

	/*
	 * Atomically switch the page's state to lock it.  If the page is not
	 * on-disk, other threads may be using it, no fast delete.
	 *
	 * Possible optimization: if the page is already deleted and the delete
	 * is visible to us (the delete has been committed), we could skip the
	 * page instead of instantiating it and figuring out there are no rows
	 * in the page.  While that's a huge amount of work to no purpose, it's
	 * unclear optimizing for overlapping range deletes is worth the effort.
	 */
	if (ref->state != WT_REF_DISK ||
	    !__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED))
		return (0);

	/*
	 * We cannot fast-delete pages that have overflow key/value items as
	 * the overflow blocks have to be discarded.  The way we figure that
	 * out is to check the on-page cell type for the page, cells for leaf
	 * pages that have no overflow items are special.
	 *
	 * In some cases, the reference address may not reference an on-page
	 * cell (for example, some combination of page splits), in which case
	 * we can't check the original cell value and we fail.
	 *
	 * To look at an on-page cell, we need to look at the parent page, and
	 * that's dangerous, our parent page could change without warning if
	 * the parent page were to split, deepening the tree.  It's safe: the
	 * page's reference will always point to some valid page, and if we find
	 * any problems we simply fail the fast-delete optimization.
	 *
	 * !!!
	 * I doubt it's worth the effort, but we could copy the cell's type into
	 * the reference structure, and then we wouldn't need an on-page cell.
	 */
	parent = ref->home;
	if (__wt_off_page(parent, ref->addr) ||
	    __wt_cell_type_raw(ref->addr) != WT_CELL_ADDR_LEAF_NO)
		goto err;

	/*
	 * This action dirties the parent page: mark it dirty now, there's no
	 * future reconciliation of the child leaf page that will dirty it as
	 * we write the tree.
	 */
	WT_ERR(__wt_page_parent_modify_set(session, ref, false));

	/*
	 * Record the change in the transaction structure and set the change's
	 * transaction ID.
	 */
	WT_ERR(__wt_calloc_one(session, &ref->page_del));
	ref->page_del->txnid = session->txn.id;

	WT_ERR(__wt_txn_modify_ref(session, ref));

	*skipp = true;
	WT_PUBLISH(ref->state, WT_REF_DELETED);
	return (0);

err:	__wt_free(session, ref->page_del);

	/*
	 * Restore the page to on-disk status, we'll have to instantiate it.
	 */
	WT_PUBLISH(ref->state, WT_REF_DISK);
	return (ret);
}
Example #21
File: bt_read.c  Project: 3rf/mongo
/*
 * __wt_cache_read --
 *	Read a page from the file.
 */
int
__wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_DECL_RET;
	WT_ITEM tmp;
	WT_PAGE *page;
	WT_PAGE_STATE previous_state;
	size_t addr_size;
	const uint8_t *addr;

	page = NULL;

	/*
	 * Don't pass an allocated buffer to the underlying block read function,
	 * force allocation of new memory of the appropriate size.
	 */
	WT_CLEAR(tmp);

	/*
	 * Attempt to set the state to WT_REF_READING for normal reads, or
	 * WT_REF_LOCKED, for deleted pages.  If successful, we've won the
	 * race, read the page.
	 */
	if (WT_ATOMIC_CAS4(ref->state, WT_REF_DISK, WT_REF_READING))
		previous_state = WT_REF_DISK;
	else if (WT_ATOMIC_CAS4(ref->state, WT_REF_DELETED, WT_REF_LOCKED))
		previous_state = WT_REF_DELETED;
	else
		return (0);

	/*
	 * Get the address: if there is no address, the page was deleted, but a
	 * subsequent search or insert is forcing re-creation of the name space.
	 * Otherwise, there's an address, read the backing disk page and build
	 * an in-memory version of the page.
	 */
	WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
	if (addr == NULL) {
		WT_ASSERT(session, previous_state == WT_REF_DELETED);

		WT_ERR(__wt_btree_new_leaf_page(session, &page));
		ref->page = page;
	} else {
		/* Read the backing disk page. */
		WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size));

		/* Build the in-memory version of the page. */
		WT_ERR(__wt_page_inmem(session, ref, tmp.data,
		    WT_DATA_IN_ITEM(&tmp) ?
		    WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));

		/* If the page was deleted, instantiate that information. */
		if (previous_state == WT_REF_DELETED)
			WT_ERR(__wt_delete_page_instantiate(session, ref));
	}

	WT_ERR(__wt_verbose(session, WT_VERB_READ,
	    "page %p: %s", page, __wt_page_type_string(page->type)));

	WT_PUBLISH(ref->state, WT_REF_MEM);
	return (0);

err:	/*
	 * If the function building an in-memory version of the page failed,
	 * it discarded the page, but not the disk image.  Discard the page
	 * and separately discard the disk image in all cases.
	 */
	if (ref->page != NULL)
		__wt_ref_out(session, ref);
	WT_PUBLISH(ref->state, previous_state);

	__wt_buf_free(session, &tmp);

	return (ret);
}
Example #22
/*
 * __sync_file --
 *	Flush pages for a specific file.
 */
static int
__sync_file(WT_SESSION_IMPL *session, int syncop)
{
	struct timespec end, start;
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	WT_REF *walk;
	WT_TXN *txn;
	uint64_t internal_bytes, leaf_bytes;
	uint64_t internal_pages, leaf_pages;
	uint32_t flags;
	bool evict_reset;

	btree = S2BT(session);

	flags = WT_READ_CACHE | WT_READ_NO_GEN;
	walk = NULL;
	txn = &session->txn;

	internal_bytes = leaf_bytes = 0;
	internal_pages = leaf_pages = 0;
	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
		WT_RET(__wt_epoch(session, &start));

	switch (syncop) {
	case WT_SYNC_WRITE_LEAVES:
		/*
		 * Write all immediately available, dirty in-cache leaf pages.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock, serialize so multiple threads don't walk the tree at
		 * the same time.
		 */
		if (!btree->modified)
			return (0);
		__wt_spin_lock(session, &btree->flush_lock);
		if (!btree->modified) {
			__wt_spin_unlock(session, &btree->flush_lock);
			return (0);
		}

		flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL;
		for (walk = NULL;;) {
			WT_ERR(__wt_tree_walk(session, &walk, NULL, flags));
			if (walk == NULL)
				break;

			/*
			 * Write dirty pages if nobody beat us to it.  Don't
			 * try to write the hottest pages: checkpoint will have
			 * to visit them anyway.
			 */
			page = walk->page;
			if (__wt_page_is_modified(page) &&
			    __wt_txn_visible_all(
			    session, page->modify->update_txn)) {
				if (txn->isolation == WT_ISO_READ_COMMITTED)
					__wt_txn_get_snapshot(session);
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
				WT_ERR(__wt_reconcile(session, walk, NULL, 0));
			}
		}
		break;
	case WT_SYNC_CHECKPOINT:
		/*
		 * We cannot check the tree modified flag in the case of a
		 * checkpoint, the checkpoint code has already cleared it.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock, serialize so multiple threads don't walk the tree at
		 * the same time.  We're holding the schema lock, but need the
		 * lower-level lock as well.
		 */
		__wt_spin_lock(session, &btree->flush_lock);

		/*
		 * When internal pages are being reconciled by checkpoint their
		 * child pages cannot disappear from underneath them or be split
		 * into them, nor can underlying blocks be freed until the block
		 * lists for the checkpoint are stable.  Set the checkpointing
		 * flag to block eviction of dirty pages until the checkpoint's
		 * internal page pass is complete, then wait for any existing
		 * eviction to complete.
		 */
		btree->checkpointing = 1;
		WT_FULL_BARRIER();

		WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset));
		if (evict_reset)
			__wt_evict_file_exclusive_off(session);

		/* Write all dirty in-cache pages. */
		flags |= WT_READ_NO_EVICT;
		for (walk = NULL;;) {
			/*
			 * If we have a page, and it was ever modified, track
			 * the highest transaction ID in the tree.  We do this
			 * here because we want the value after reconciling
			 * dirty pages.
			 */
			if (walk != NULL && walk->page != NULL &&
			    (mod = walk->page->modify) != NULL &&
			    WT_TXNID_LT(btree->rec_max_txn, mod->rec_max_txn))
				btree->rec_max_txn = mod->rec_max_txn;

			WT_ERR(__wt_tree_walk(session, &walk, NULL, flags));
			if (walk == NULL)
				break;

			page = walk->page;
			mod = page->modify;

			/* Skip clean pages. */
			if (!__wt_page_is_modified(page))
				continue;

			/*
			 * Write dirty pages, unless we can be sure they only
			 * became dirty after the checkpoint started.
			 *
			 * We can skip dirty pages if:
			 * (1) they are leaf pages;
			 * (2) there is a snapshot transaction active (which
			 *     is the case in ordinary application checkpoints
			 *     but not all internal cases); and
			 * (3) the first dirty update on the page is
			 *     sufficiently recent that the checkpoint
			 *     transaction would skip them.
			 *
			 * Mark the tree dirty: the checkpoint marked it clean
			 * and we can't skip future checkpoints until this page
			 * is written.
			 */
			if (!WT_PAGE_IS_INTERNAL(page) &&
			    F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) &&
			    WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn) &&
			    mod->rec_result != WT_PM_REC_REWRITE) {
				__wt_page_modify_set(session, page);
				continue;
			}

			if (WT_PAGE_IS_INTERNAL(page)) {
				internal_bytes += page->memory_footprint;
				++internal_pages;
			} else {
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
			}
			WT_ERR(__wt_reconcile(session, walk, NULL, 0));
		}
		break;
	}

	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
		WT_ERR(__wt_epoch(session, &end));
		WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
		    "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64
		    " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64
		    " bytes, %" PRIu64 " pages of internal\n\t"
		    "Took: %" PRIu64 "ms",
		    syncop == WT_SYNC_WRITE_LEAVES ?
		    "WRITE_LEAVES" : "CHECKPOINT",
		    leaf_bytes, leaf_pages, internal_bytes, internal_pages,
		    WT_TIMEDIFF(end, start) / WT_MILLION));
	}

err:	/* On error, clear any left-over tree walk. */
	if (walk != NULL)
		WT_TRET(__wt_page_release(session, walk, flags));

	if (txn->isolation == WT_ISO_READ_COMMITTED && session->ncursors == 0)
		__wt_txn_release_snapshot(session);

	if (btree->checkpointing) {
		/*
		 * Update the checkpoint generation for this handle so visible
		 * updates newer than the checkpoint can be evicted.
		 *
		 * This has to be published before eviction is enabled again,
		 * so that eviction knows that the checkpoint has completed.
		 */
		WT_PUBLISH(btree->checkpoint_gen,
		    S2C(session)->txn_global.checkpoint_gen);
		WT_STAT_FAST_DATA_SET(session,
		    btree_checkpoint_generation, btree->checkpoint_gen);

		/*
		 * Clear the checkpoint flag and push the change; not required,
		 * but publishing the change means stalled eviction gets moving
		 * as soon as possible.
		 */
		btree->checkpointing = 0;
		WT_FULL_BARRIER();

		/*
		 * If this tree was being skipped by the eviction server during
		 * the checkpoint, clear the wait.
		 */
		btree->evict_walk_period = 0;

		/*
		 * Wake the eviction server, in case application threads have
		 * stalled while the eviction server decided it couldn't make
		 * progress.  Without this, application threads will be stalled
		 * until the eviction server next wakes.
		 */
		WT_TRET(__wt_evict_server_wake(session));
	}

	__wt_spin_unlock(session, &btree->flush_lock);

	/*
	 * Leaves are written before a checkpoint (or as part of a file close,
	 * before checkpointing the file).  Start a flush to stable storage,
	 * but don't wait for it.
	 */
	if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES)
		WT_RET(btree->bm->sync(btree->bm, session, true));

	return (ret);
}
Example #23
/*
 * __wt_configure_method --
 *	WT_CONNECTION.configure_method.
 */
int
__wt_configure_method(WT_SESSION_IMPL *session,
    const char *method, const char *uri,
    const char *config, const char *type, const char *check)
{
	const WT_CONFIG_CHECK *cp;
	WT_CONFIG_CHECK *checks, *newcheck;
	const WT_CONFIG_ENTRY **epp;
	WT_CONFIG_ENTRY *entry;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	size_t cnt;
	char *newcheck_name, *p;

	/*
	 * !!!
	 * We ignore the specified uri, that is, all new configuration options
	 * will be valid for all data sources.   That shouldn't be too bad
	 * as the worst that can happen is an application might specify some
	 * configuration option and not get an error -- the option should be
	 * ignored by the underlying implementation since it's unexpected, so
	 * there shouldn't be any real problems.  Eventually I expect we will
	 * get the whole data-source thing sorted, at which time there may be
	 * configuration arrays for each data source, and that's when the uri
	 * will matter.
	 */
	WT_UNUSED(uri);

	conn = S2C(session);
	checks = newcheck = NULL;
	entry = NULL;
	newcheck_name = NULL;

	/* Argument checking; we only support a limited number of types. */
	if (config == NULL)
		WT_RET_MSG(session, EINVAL, "no configuration specified");
	if (type == NULL)
		WT_RET_MSG(session, EINVAL, "no configuration type specified");
	if (strcmp(type, "boolean") != 0 && strcmp(type, "int") != 0 &&
	    strcmp(type, "list") != 0 && strcmp(type, "string") != 0)
		WT_RET_MSG(session, EINVAL,
		    "type must be one of \"boolean\", \"int\", \"list\" or "
		    "\"string\"");

	/* Find a match for the method name. */
	for (epp = conn->config_entries; (*epp)->method != NULL; ++epp)
		if (strcmp((*epp)->method, method) == 0)
			break;
	if ((*epp)->method == NULL)
		WT_RET_MSG(session,
		    WT_NOTFOUND, "no method matching %s found", method);

	/*
	 * Technically possible for threads to race, lock the connection while
	 * adding the new configuration information.  We're holding the lock
	 * for an extended period of time, but configuration changes should be
	 * rare and only happen during startup.
	 */
	__wt_spin_lock(session, &conn->api_lock);

	/*
	 * Allocate new configuration entry and fill it in.
	 *
	 * The new base value is the previous base value, a separator and the
	 * new configuration string.
	 */
	WT_ERR(__wt_calloc_one(session, &entry));
	entry->method = (*epp)->method;
	WT_ERR(__wt_calloc_def(session,
	    strlen((*epp)->base) + strlen(",") + strlen(config) + 1, &p));
	(void)strcpy(p, (*epp)->base);
	(void)strcat(p, ",");
	(void)strcat(p, config);
	entry->base = p;

	/*
	 * There may be a default value in the config argument passed in (for
	 * example, "kvs_parallelism=64").  The default value isn't part of the
	 * name, build a new one.
	 */
	WT_ERR(__wt_strdup(session, config, &newcheck_name));
	if ((p = strchr(newcheck_name, '=')) != NULL)
		*p = '\0';

	/*
	 * The new configuration name may replace an existing check with new
	 * information, in that case skip the old version.
	 */
	cnt = 0;
	if ((*epp)->checks != NULL)
		for (cp = (*epp)->checks; cp->name != NULL; ++cp)
			++cnt;
	WT_ERR(__wt_calloc_def(session, cnt + 2, &checks));
	cnt = 0;
	if ((*epp)->checks != NULL)
		for (cp = (*epp)->checks; cp->name != NULL; ++cp)
			if (strcmp(newcheck_name, cp->name) != 0)
				checks[cnt++] = *cp;
	newcheck = &checks[cnt];
	newcheck->name = newcheck_name;
	WT_ERR(__wt_strdup(session, type, &newcheck->type));
	if (check != NULL)
		WT_ERR(__wt_strdup(session, check, &newcheck->checks));
	entry->checks = checks;

	/*
	 * Confirm the configuration string passes the new set of
	 * checks.
	 */
	WT_ERR(config_check(session, entry->checks, config, 0));

	/*
	 * The next time this configuration is updated, we don't want to figure
	 * out which of these pieces of memory were allocated and will need to
	 * be free'd on close (this isn't a heavily used API and it's too much
	 * work); add them all to the free-on-close list now.  We don't check
	 * for errors deliberately, we'd have to figure out which elements have
	 * already been added to the free-on-close array and which have not in
	 * order to avoid freeing chunks of memory twice.  Again, this isn't a
	 * commonly used API and it shouldn't ever happen, just leak it.
	 */
	(void)__conn_foc_add(session, entry->base);
	(void)__conn_foc_add(session, entry);
	(void)__conn_foc_add(session, checks);
	(void)__conn_foc_add(session, newcheck->type);
	(void)__conn_foc_add(session, newcheck->checks);
	(void)__conn_foc_add(session, newcheck_name);

	/*
	 * Instead of using locks to protect configuration information, assume
	 * we can atomically update a pointer to a chunk of memory, and because
	 * a pointer is never partially written, readers will correctly see the
	 * original or new versions of the memory.  Readers might be using the
	 * old version as it's being updated, though, which means we cannot free
	 * the old chunk of memory until all possible readers have finished.
	 * Currently, that's on connection close: in other words, we can use
	 * this because it's small amounts of memory, and we really, really do
	 * not want to acquire locks every time we access configuration strings,
	 * since that's done on every API call.
	 */
	WT_PUBLISH(*epp, entry);

	if (0) {
err:		if (entry != NULL) {
			__wt_free(session, entry->base);
			__wt_free(session, entry);
		}
		__wt_free(session, checks);
		if (newcheck != NULL) {
			__wt_free(session, newcheck->type);
			__wt_free(session, newcheck->checks);
		}
		__wt_free(session, newcheck_name);
	}

	__wt_spin_unlock(session, &conn->api_lock);
	return (ret);
}
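
The closing comment of the function above describes the read-mostly update scheme: build a complete replacement entry off to the side, publish it with a single pointer store, and defer freeing the old entry until no reader can still hold it (here, connection close). A minimal sketch of that scheme under those assumptions, with hypothetical types and C11 atomics instead of WT_PUBLISH:

#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>

struct config_entry {
	char *base;			/* configuration defaults */
};

static struct config_entry *_Atomic current_entry;

/* Writer: fill in the new entry completely, then publish the pointer. */
static int
config_entry_replace(const char *new_base)
{
	struct config_entry *entry;

	if ((entry = calloc(1, sizeof(*entry))) == NULL)
		return (-1);
	if ((entry->base = strdup(new_base)) == NULL) {
		free(entry);
		return (-1);
	}

	/*
	 * Release store: the fields become visible before the pointer does.
	 * The previous entry is deliberately left alone; it is reclaimed only
	 * when no readers remain (at close, in the example above).
	 */
	atomic_store_explicit(&current_entry, entry, memory_order_release);
	return (0);
}

/* Reader: a single acquire load, no lock required. */
static const struct config_entry *
config_entry_get(void)
{
	return (atomic_load_explicit(&current_entry, memory_order_acquire));
}
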
Example #24
/*
 * __sync_file --
 *	Flush pages for a specific file.
 */
static int
__sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
{
	struct timespec end, start;
	WT_BTREE *btree;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	WT_REF *walk;
	WT_TXN *txn;
	uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages;
	uint64_t oldest_id, saved_snap_min;
	uint32_t flags;

	conn = S2C(session);
	btree = S2BT(session);
	walk = NULL;
	txn = &session->txn;
	saved_snap_min = WT_SESSION_TXN_STATE(session)->snap_min;
	flags = WT_READ_CACHE | WT_READ_NO_GEN;

	internal_bytes = leaf_bytes = 0;
	internal_pages = leaf_pages = 0;
	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
		WT_RET(__wt_epoch(session, &start));

	switch (syncop) {
	case WT_SYNC_WRITE_LEAVES:
		/*
		 * Write all immediately available, dirty in-cache leaf pages.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock, serialize so multiple threads don't walk the tree at
		 * the same time.
		 */
		if (!btree->modified)
			return (0);
		__wt_spin_lock(session, &btree->flush_lock);
		if (!btree->modified) {
			__wt_spin_unlock(session, &btree->flush_lock);
			return (0);
		}

		/*
		 * Save the oldest transaction ID we need to keep around.
		 * Otherwise, in a busy system, we could be updating pages so
		 * fast that write leaves never catches up.  We deliberately
		 * have no transaction running at this point that would keep
		 * the oldest ID from moving forwards as we walk the tree.
		 */
		oldest_id = __wt_txn_oldest_id(session);

		flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL;
		for (walk = NULL;;) {
			WT_ERR(__wt_tree_walk(session, &walk, flags));
			if (walk == NULL)
				break;

			/*
			 * Write dirty pages if nobody beat us to it.  Don't
			 * try to write hot pages (defined as pages that have
			 * been updated since the write phase leaves started):
			 * checkpoint will have to visit them anyway.
			 */
			page = walk->page;
			if (__wt_page_is_modified(page) &&
			    WT_TXNID_LT(page->modify->update_txn, oldest_id)) {
				if (txn->isolation == WT_ISO_READ_COMMITTED)
					__wt_txn_get_snapshot(session);
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
				WT_ERR(__wt_reconcile(session, walk, NULL, 0));
			}
		}
		break;
	case WT_SYNC_CHECKPOINT:
		/*
		 * If we are flushing a file at read-committed isolation, which
		 * is of particular interest for flushing the metadata to make
		 * schema-changing operation durable, get a transactional
		 * snapshot now.
		 *
		 * All changes committed up to this point should be included.
		 * We don't update the snapshot in between pages because (a)
		 * the metadata shouldn't be that big, and (b) if we do ever
		 */
		if (txn->isolation == WT_ISO_READ_COMMITTED)
			__wt_txn_get_snapshot(session);

		/*
		 * We cannot check the tree modified flag in the case of a
		 * checkpoint, the checkpoint code has already cleared it.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock, serialize so multiple threads don't walk the tree at
		 * the same time.  We're holding the schema lock, but need the
		 * lower-level lock as well.
		 */
		__wt_spin_lock(session, &btree->flush_lock);

		/*
		 * In the final checkpoint pass, child pages cannot be evicted
		 * from underneath internal pages nor can underlying blocks be
		 * freed until the checkpoint's block lists are stable. Also,
		 * we cannot split child pages into parents unless we know the
		 * final pass will write a consistent view of that namespace.
		 * Set the checkpointing flag to block such actions and wait for
		 * any problematic eviction or page splits to complete.
		 */
		WT_PUBLISH(btree->checkpointing, WT_CKPT_PREPARE);

		WT_ERR(__wt_evict_file_exclusive_on(session));
		__wt_evict_file_exclusive_off(session);

		WT_PUBLISH(btree->checkpointing, WT_CKPT_RUNNING);

		/* Write all dirty in-cache pages. */
		flags |= WT_READ_NO_EVICT;
		for (walk = NULL;;) {
			WT_ERR(__wt_tree_walk(session, &walk, flags));
			if (walk == NULL)
				break;

			/* Skip clean pages. */
			if (!__wt_page_is_modified(walk->page))
				continue;

			/*
			 * Take a local reference to the page modify structure
			 * now that we know the page is dirty. It needs to be
			 * done in this order otherwise the page modify
			 * structure could have been created between taking the
			 * reference and checking modified.
			 */
			page = walk->page;
			mod = page->modify;

			/*
			 * Write dirty pages, unless we can be sure they only
			 * became dirty after the checkpoint started.
			 *
			 * We can skip dirty pages if:
			 * (1) they are leaf pages;
			 * (2) there is a snapshot transaction active (which
			 *     is the case in ordinary application checkpoints
			 *     but not all internal cases); and
			 * (3) the first dirty update on the page is
			 *     sufficiently recent that the checkpoint
			 *     transaction would skip them.
			 *
			 * Mark the tree dirty: the checkpoint marked it clean
			 * and we can't skip future checkpoints until this page
			 * is written.
			 */
			if (!WT_PAGE_IS_INTERNAL(page) &&
			    F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) &&
			    WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn)) {
				__wt_page_modify_set(session, page);
				continue;
			}

			if (WT_PAGE_IS_INTERNAL(page)) {
				internal_bytes += page->memory_footprint;
				++internal_pages;
			} else {
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
			}
			WT_ERR(__wt_reconcile(session, walk, NULL, 0));
		}
		break;
	case WT_SYNC_CLOSE:
	case WT_SYNC_DISCARD:
	WT_ILLEGAL_VALUE_ERR(session);
	}

	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
		WT_ERR(__wt_epoch(session, &end));
		WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
		    "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64
		    " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64
		    " bytes, %" PRIu64 " pages of internal\n\t"
		    "Took: %" PRIu64 "ms",
		    syncop == WT_SYNC_WRITE_LEAVES ?
		    "WRITE_LEAVES" : "CHECKPOINT",
		    leaf_bytes, leaf_pages, internal_bytes, internal_pages,
		    WT_TIMEDIFF_MS(end, start)));
	}

err:	/* On error, clear any left-over tree walk. */
	if (walk != NULL)
		WT_TRET(__wt_page_release(session, walk, flags));

	/*
	 * If we got a snapshot in order to write pages, and there was no
	 * snapshot active when we started, release it.
	 */
	if (txn->isolation == WT_ISO_READ_COMMITTED &&
	    saved_snap_min == WT_TXN_NONE)
		__wt_txn_release_snapshot(session);

	if (btree->checkpointing != WT_CKPT_OFF) {
		/*
		 * Update the checkpoint generation for this handle so visible
		 * updates newer than the checkpoint can be evicted.
		 *
		 * This has to be published before eviction is enabled again,
		 * so that eviction knows that the checkpoint has completed.
		 */
		WT_PUBLISH(btree->checkpoint_gen,
		    conn->txn_global.checkpoint_gen);
		WT_STAT_FAST_DATA_SET(session,
		    btree_checkpoint_generation, btree->checkpoint_gen);

		/*
		 * Clear the checkpoint flag and push the change; not required,
		 * but publishing the change means stalled eviction gets moving
		 * as soon as possible.
		 */
		btree->checkpointing = WT_CKPT_OFF;
		WT_FULL_BARRIER();

		/*
		 * If this tree was being skipped by the eviction server during
		 * the checkpoint, clear the wait.
		 */
		btree->evict_walk_period = 0;

		/*
		 * Wake the eviction server, in case application threads have
		 * stalled while the eviction server decided it couldn't make
		 * progress.  Without this, application threads will be stalled
		 * until the eviction server next wakes.
		 */
		WT_TRET(__wt_evict_server_wake(session));
	}

	__wt_spin_unlock(session, &btree->flush_lock);

	/*
	 * Leaves are written before a checkpoint (or as part of a file close,
	 * before checkpointing the file).  Start a flush to stable storage,
	 * but don't wait for it.
	 */
	if (ret == 0 &&
	    syncop == WT_SYNC_WRITE_LEAVES && F_ISSET(conn, WT_CONN_CKPT_SYNC))
		WT_RET(btree->bm->sync(btree->bm, session, true));

	return (ret);
}
Example #25
/*
 * __page_read --
 *	Read a page from the file.
 */
static int
__page_read(WT_SESSION_IMPL *session, WT_REF *ref)
{
	const WT_PAGE_HEADER *dsk;
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_ITEM tmp;
	WT_PAGE *page;
	size_t addr_size;
	uint32_t previous_state;
	const uint8_t *addr;

	btree = S2BT(session);
	page = NULL;

	/*
	 * Don't pass an allocated buffer to the underlying block read function,
	 * force allocation of new memory of the appropriate size.
	 */
	WT_CLEAR(tmp);

	/*
	 * Attempt to set the state to WT_REF_READING for normal reads, or
	 * WT_REF_LOCKED, for deleted pages.  If successful, we've won the
	 * race, read the page.
	 */
	if (__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_READING))
		previous_state = WT_REF_DISK;
	else if (__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED))
		previous_state = WT_REF_DELETED;
	else
		return (0);

	/*
	 * Get the address: if there is no address, the page was deleted, but a
	 * subsequent search or insert is forcing re-creation of the name space.
	 */
	WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
	if (addr == NULL) {
		WT_ASSERT(session, previous_state == WT_REF_DELETED);

		WT_ERR(__wt_btree_new_leaf_page(session, &page));
		ref->page = page;
		goto done;
	}

	/*
	 * There's an address, read or map the backing disk page and build an
	 * in-memory version of the page.
	 */
	WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size));
	WT_ERR(__wt_page_inmem(session, ref, tmp.data, tmp.memsize,
	    WT_DATA_IN_ITEM(&tmp) ?
	    WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));

	/*
	 * Clear the local reference to an allocated copy of the disk image on
	 * return; the page steals it, errors in this code should not free it.
	 */
	tmp.mem = NULL;

	/*
	 * If reading for a checkpoint, there's no additional work to do, the
	 * page on disk is correct as written.
	 */
	if (session->dhandle->checkpoint != NULL)
		goto done;

	/* If the page was deleted, instantiate that information. */
	if (previous_state == WT_REF_DELETED)
		WT_ERR(__wt_delete_page_instantiate(session, ref));

	/*
	 * Instantiate updates from the database's lookaside table. The page
	 * flag was set when the page was written, potentially a long time ago.
	 * We only care if the lookaside table is currently active, check that
	 * before doing any work.
	 */
	dsk = tmp.data;
	if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE) && __wt_las_is_written(session)) {
		WT_STAT_FAST_CONN_INCR(session, cache_read_lookaside);
		WT_STAT_FAST_DATA_INCR(session, cache_read_lookaside);

		WT_ERR(__las_page_instantiate(
		    session, ref, btree->id, addr, addr_size));
	}

done:	WT_PUBLISH(ref->state, WT_REF_MEM);
	return (0);

err:	/*
	 * If the function building an in-memory version of the page failed,
	 * it discarded the page, but not the disk image.  Discard the page
	 * and separately discard the disk image in all cases.
	 */
	if (ref->page != NULL)
		__wt_ref_out(session, ref);
	WT_PUBLISH(ref->state, previous_state);

	__wt_buf_free(session, &tmp);

	return (ret);
}
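
The interesting part of __page_read is the state protocol: win the read race
with a compare-and-swap, build the in-memory page, and only then publish
WT_REF_MEM (restoring the previous state on error).  A condensed sketch of
that protocol, assuming C11 atomics; the ref_state enum and the
build_in_memory() helper are hypothetical stand-ins, not WiredTiger functions.

#include <stdatomic.h>

enum ref_state { REF_DISK, REF_DELETED, REF_READING, REF_LOCKED, REF_MEM };

struct ref {
	_Atomic enum ref_state state;
	void *page;
};

/* Hypothetical stand-in for reading the block and building the page. */
static int
build_in_memory(struct ref *ref)
{
	ref->page = ref;		/* pretend we built something */
	return (0);
}

static int
page_read(struct ref *ref)
{
	enum ref_state expected, previous;
	int ret;

	/* Try to win the race: only one thread moves DISK to READING. */
	expected = REF_DISK;
	if (atomic_compare_exchange_strong(&ref->state, &expected, REF_READING))
		previous = REF_DISK;
	else {
		expected = REF_DELETED;
		if (atomic_compare_exchange_strong(
		    &ref->state, &expected, REF_LOCKED))
			previous = REF_DELETED;
		else
			return (0);	/* Somebody else has the read. */
	}

	if ((ret = build_in_memory(ref)) != 0) {
		/* On error, restore whatever state we found. */
		atomic_store(&ref->state, previous);
		return (ret);
	}

	/* Publish: the page is fully set up before the state exposes it. */
	atomic_store(&ref->state, REF_MEM);
	return (0);
}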
Ejemplo n.º 26
0
File: txn.c Project: To4e/mongo
/*
 * __wt_txn_release --
 *	Release the resources associated with the current transaction.
 */
void
__wt_txn_release(WT_SESSION_IMPL *session)
{
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_STATE *txn_state;
	int was_oldest;

	txn = &session->txn;
	WT_ASSERT(session, txn->mod_count == 0);
	txn->notify = NULL;

	txn_global = &S2C(session)->txn_global;
	txn_state = WT_SESSION_TXN_STATE(session);
	was_oldest = 0;

	/* Clear the transaction's ID from the global table. */
	if (WT_SESSION_IS_CHECKPOINT(session)) {
		WT_ASSERT(session, txn_state->id == WT_TXN_NONE);
		txn->id = WT_TXN_NONE;

		/* Clear the global checkpoint transaction IDs. */
		txn_global->checkpoint_id = 0;
		txn_global->checkpoint_pinned = WT_TXN_NONE;
	} else if (F_ISSET(txn, WT_TXN_HAS_ID)) {
		WT_ASSERT(session,
		    !WT_TXNID_LT(txn->id, txn_global->last_running));

		WT_ASSERT(session, txn_state->id != WT_TXN_NONE &&
		    txn->id != WT_TXN_NONE);
		WT_PUBLISH(txn_state->id, WT_TXN_NONE);

		/* Quick check for the oldest transaction. */
		was_oldest = (txn->id == txn_global->last_running);
		txn->id = WT_TXN_NONE;
	}

	/* Free the scratch buffer allocated for logging. */
	__wt_logrec_free(session, &txn->logrec);

	/* Discard any memory from the session's split stash that we can. */
	WT_ASSERT(session, session->split_gen == 0);
	if (session->split_stash_cnt > 0)
		__wt_split_stash_discard(session);

	/*
	 * Reset the transaction state to not running and release the snapshot.
	 */
	__wt_txn_release_snapshot(session);
	txn->isolation = session->isolation;
	/* Ensure the transaction flags are cleared on exit */
	txn->flags = 0;

	/*
	 * When the oldest transaction in the system completes, bump the oldest
	 * ID.  This is racy and so not guaranteed, but in practice it keeps
	 * the oldest ID from falling too far behind.
	 */
	if (was_oldest)
		__wt_txn_update_oldest(session, 1);
}
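
The key step above is clearing the session's slot in the shared transaction
table with a publishing store, so concurrent scans that compute the oldest
running ID stop counting the finished transaction.  A rough sketch of that
idea with C11 atomics; the fixed-size slot array and oldest_running() scan
are hypothetical, not WiredTiger's layout.

#include <stdatomic.h>
#include <stdint.h>

#define TXN_NONE	0
#define N_SESSIONS	8

static _Atomic uint64_t txn_slots[N_SESSIONS];

/* Commit/rollback: clear the slot so other threads stop seeing the ID. */
static void
txn_slot_release(int slot)
{
	/* Release store: the transaction's prior writes are visible first. */
	atomic_store_explicit(&txn_slots[slot], TXN_NONE, memory_order_release);
}

/* A reader computing the oldest running ID ignores cleared slots. */
static uint64_t
oldest_running(void)
{
	uint64_t id, oldest = UINT64_MAX;
	int i;

	for (i = 0; i < N_SESSIONS; ++i) {
		id = atomic_load_explicit(&txn_slots[i], memory_order_acquire);
		if (id != TXN_NONE && id < oldest)
			oldest = id;
	}
	return (oldest);
}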
Ejemplo n.º 27
0
/*
 * __wt_hazard_set --
 *	Set a hazard pointer.
 */
int
__wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, bool *busyp
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
	WT_BTREE *btree;
	WT_CONNECTION_IMPL *conn;
	WT_HAZARD *hp;
	int restarts = 0;

	btree = S2BT(session);
	conn = S2C(session);
	*busyp = false;

	/* If a file can never be evicted, hazard pointers aren't required. */
	if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
		return (0);

	/*
	 * Do the dance:
	 *
	 * The memory location which makes a page "real" is the WT_REF's state
	 * of WT_REF_MEM, which can be set to WT_REF_LOCKED at any time by the
	 * page eviction server.
	 *
	 * Add the WT_REF reference to the session's hazard list and flush the
	 * write, then see if the page's state is still valid.  If so, we can
	 * use the page because the page eviction server will see our hazard
	 * pointer before it discards the page (the eviction server sets the
	 * state to WT_REF_LOCKED, then flushes memory and checks the hazard
	 * pointers).
	 *
	 * For sessions with many active hazard pointers, skip most of the
	 * active slots: there may be a free slot in there, but checking is
	 * expensive.  Most hazard pointers are released quickly: optimize
	 * for that case.
	 */
	for (hp = session->hazard + session->nhazard;; ++hp) {
		/*
		 * If we get to the end of the array, either:
		 * 1. If we know there are free slots somewhere, and this is
		 *    the first time through, continue the search from the
		 *    start.  Don't actually continue the loop because that
		 *    will skip the first slot.
		 * 2. If we have searched all the way through and we have
		 *    allocated the maximum number of slots, give up.
		 * 3. Allocate another increment of slots, up to the maximum.
		 *    The slot we are on should now be available.
		 */
		if (hp >= session->hazard + session->hazard_size) {
			if (session->nhazard < session->hazard_size &&
			    restarts++ == 0)
				hp = session->hazard;
			else if (session->hazard_size >= conn->hazard_max)
				break;
			else
				WT_PUBLISH(session->hazard_size, WT_MIN(
				    session->hazard_size + WT_HAZARD_INCR,
				    conn->hazard_max));
		}

		if (hp->page != NULL)
			continue;

		hp->page = ref->page;
#ifdef HAVE_DIAGNOSTIC
		hp->file = file;
		hp->line = line;
#endif
		/* Publish the hazard pointer before reading page's state. */
		WT_FULL_BARRIER();

		/*
		 * Check if the page state is still valid, where valid means a
		 * state of WT_REF_MEM and the pointer is unchanged.  (If the
		 * pointer changed, the page was evicted between the time we
		 * set our hazard pointer and the publication.  It would
		 * theoretically be possible for the page to be evicted and a
		 * different page read into the same memory, so the pointer
		 * hasn't changed but the contents have.  That's OK: we found
		 * this page using the tree's key space, and whatever page we
		 * find here is the page for us to use.)
		 */
		if (ref->page == hp->page && ref->state == WT_REF_MEM) {
			++session->nhazard;
			return (0);
		}

		/*
		 * The page isn't available, it's being considered for eviction
		 * (or being evicted, for all we know).  If the eviction server
		 * sees our hazard pointer before evicting the page, it will
		 * return the page to use, no harm done, if it doesn't, it will
		 * go ahead and complete the eviction.
		 *
		 * We don't bother publishing this update: the worst case is we
		 * prevent some random page from being evicted.
		 */
		hp->page = NULL;
		*busyp = true;
		return (0);
	}

	__wt_errx(session,
	    "session %p: hazard pointer table full", (void *)session);
#ifdef HAVE_DIAGNOSTIC
	__hazard_dump(session);
#endif

	return (ENOMEM);
}
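
The "dance" the comment describes is the classic hazard-pointer protocol:
publish the pointer, issue a full barrier, then re-check that the page is
still valid before using it.  A stripped-down sketch of that protocol with
C11 atomics; the struct layouts and names here are hypothetical, not
WiredTiger's.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

enum ref_state { REF_MEM, REF_LOCKED };

struct ref {
	void *_Atomic page;
	_Atomic enum ref_state state;
};

struct hazard {
	void *_Atomic page;		/* NULL means the slot is free */
};

/* Try to pin ref->page; returns true and sets *pagep on success. */
static bool
hazard_set(struct hazard *hp, struct ref *ref, void **pagep)
{
	void *page;

	page = atomic_load(&ref->page);
	atomic_store(&hp->page, page);

	/* Full barrier: the hazard entry is visible before the re-check. */
	atomic_thread_fence(memory_order_seq_cst);

	/* Valid only if the state is still MEM and the pointer unchanged. */
	if (atomic_load(&ref->state) == REF_MEM &&
	    atomic_load(&ref->page) == page) {
		*pagep = page;
		return (true);
	}

	/* Eviction may be in progress: clear the entry and report busy. */
	atomic_store(&hp->page, NULL);
	return (false);
}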
Ejemplo n.º 28
0
/*
 * __wt_delete_page --
 *	If deleting a range, try to delete the page without instantiating it.
 */
int
__wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
	WT_DECL_RET;
	WT_PAGE *parent;

	*skipp = false;

	/* If we have a clean page in memory, attempt to evict it. */
	if (ref->state == WT_REF_MEM &&
	    __wt_atomic_casv32(&ref->state, WT_REF_MEM, WT_REF_LOCKED)) {
		if (__wt_page_is_modified(ref->page)) {
			WT_PUBLISH(ref->state, WT_REF_MEM);
			return (0);
		}

		(void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1);
		ret = __wt_evict(session, ref, false);
		(void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1);
		WT_RET_BUSY_OK(ret);
	}

	/*
	 * Atomically switch the page's state to lock it.  If the page is not
	 * on disk, other threads may be using it, so there's no fast delete.
	 */
	if (ref->state != WT_REF_DISK ||
	    !__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED))
		return (0);

	/*
	 * We cannot fast-delete pages that have overflow key/value items, as
	 * the overflow blocks have to be discarded.  The way we figure that
	 * out is to check the page's cell type: cells for leaf pages without
	 * overflow items are special.
	 *
	 * To look at an on-page cell, we need to look at the parent page, and
	 * that's dangerous: our parent page could change without warning if it
	 * were to split, deepening the tree.  It's still safe, though: the
	 * page's reference will always point to some valid page, and if we
	 * find any problem we simply fail the fast-delete optimization.
	 */
	parent = ref->home;
	if (__wt_off_page(parent, ref->addr) ?
	    ((WT_ADDR *)ref->addr)->type != WT_ADDR_LEAF_NO :
	    __wt_cell_type_raw(ref->addr) != WT_CELL_ADDR_LEAF_NO)
		goto err;

	/*
	 * This action dirties the parent page: mark it dirty now, there's no
	 * future reconciliation of the child leaf page that will dirty it as
	 * we write the tree.
	 */
	WT_ERR(__wt_page_parent_modify_set(session, ref, false));

	/*
	 * Record the change in the transaction structure and set the change's
	 * transaction ID.
	 */
	WT_ERR(__wt_calloc_one(session, &ref->page_del));
	ref->page_del->txnid = session->txn.id;

	WT_ERR(__wt_txn_modify_ref(session, ref));

	*skipp = true;
	WT_STAT_CONN_INCR(session, rec_page_delete_fast);
	WT_STAT_DATA_INCR(session, rec_page_delete_fast);
	WT_PUBLISH(ref->state, WT_REF_DELETED);
	return (0);

err:	__wt_free(session, ref->page_del);

	/*
	 * Restore the page to on-disk status, we'll have to instantiate it.
	 */
	WT_PUBLISH(ref->state, WT_REF_DISK);
	return (ret);
}
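
From a caller's point of view, the contract is the skipp out-parameter: if
the fast path manages to lock the on-disk ref and record the deletion, the
caller skips the page entirely; otherwise it falls back to reading the page
in and deleting its rows.  A hypothetical usage sketch of that contract;
delete_page() and instantiate_and_delete() below are made-up stand-ins, not
WiredTiger functions.

#include <stdbool.h>

struct ref { int unused; };

/* Hypothetical fast path: delete the page without reading it, if possible. */
static int
delete_page(struct ref *ref, bool *skipp)
{
	(void)ref;
	*skipp = true;			/* pretend the fast path succeeded */
	return (0);
}

/* Hypothetical slow path: read the page in and delete row by row. */
static int
instantiate_and_delete(struct ref *ref)
{
	(void)ref;
	return (0);
}

static int
truncate_one(struct ref *ref)
{
	bool skip;
	int ret;

	if ((ret = delete_page(ref, &skip)) != 0)
		return (ret);
	if (skip)
		return (0);	/* Deleted without instantiating the page. */

	/* The fast path declined: fall back to the slow path. */
	return (instantiate_and_delete(ref));
}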
Ejemplo n.º 29
0
/*
 * __wt_col_append_serial_func --
 *	Server function to append an WT_INSERT entry to the tree.
 */
int
__wt_col_append_serial_func(WT_SESSION_IMPL *session, void *args)
{
	WT_BTREE *btree;
	WT_INSERT *ins, *new_ins, ***ins_stack, **next_stack;
	WT_INSERT_HEAD *inshead, **insheadp, **new_inslist, *new_inshead;
	WT_PAGE *page;
	uint64_t recno;
	uint32_t write_gen;
	u_int i, skipdepth;

	btree = S2BT(session);

	__wt_col_append_unpack(args,
	    &page, &write_gen, &insheadp, &ins_stack, &next_stack,
	    &new_inslist, &new_inshead, &new_ins, &skipdepth);

	/* Check the page's write-generation. */
	WT_RET(__wt_page_write_gen_check(session, page, write_gen));

	if ((inshead = *insheadp) == NULL)
		inshead = new_inshead;

	/*
	 * If the application specified a record number, there's a race: the
	 * application may have searched for the record, not found it, then
	 * called into the append code, and another thread might have added
	 * the record.  Fortunately, we're in the right place because if the
	 * record didn't exist at some point, it can only have been created
	 * on this list.  Search for the record, if specified.
	 */
	if ((recno = WT_INSERT_RECNO(new_ins)) == 0)
		recno = WT_INSERT_RECNO(new_ins) = ++btree->last_recno;

	ins = __col_insert_search(inshead, ins_stack, next_stack, recno);

	/* If we find the record number, there's been a race. */
	if (ins != NULL && WT_INSERT_RECNO(ins) == recno)
		WT_RET(WT_RESTART);

	/*
	 * Publish: First, point the new WT_INSERT item's skiplist references
	 * to the next elements in the insert list, then flush memory.  Second,
	 * update the skiplist elements that reference the new WT_INSERT item,
	 * this ensures the list is never inconsistent.
	 */
	for (i = 0; i < skipdepth; i++)
		new_ins->next[i] = *ins_stack[i];
	WT_WRITE_BARRIER();
	for (i = 0; i < skipdepth; i++) {
		if (inshead->tail[i] == NULL ||
		    ins_stack[i] == &inshead->tail[i]->next[i])
			inshead->tail[i] = new_ins;
		*ins_stack[i] = new_ins;
	}

	__wt_col_append_new_ins_taken(args);

	/*
	 * If the insert head does not yet have an insert list, our caller
	 * passed us one.
	 *
	 * NOTE: it is important to do this after the item has been added to
	 * the list.  Code can assume that if the list is set, it is non-empty.
	 */
	if (*insheadp == NULL) {
		WT_PUBLISH(*insheadp, new_inshead);
		__wt_col_append_new_inshead_taken(args);
	}

	/*
	 * If the page does not yet have an insert array, our caller passed
	 * us one.
	 *
	 * NOTE: it is important to do this after publishing the list entry.
	 * Code can assume that if the array is set, it is non-empty.
	 */
	if (page->modify->append == NULL) {
		page->modify->append = new_inslist;
		__wt_col_append_new_inslist_taken(args);
	}

	/*
	 * If we don't find the record, check to see if we extended the file,
	 * and update the last record number.
	 */
	if (recno > btree->last_recno)
		btree->last_recno = recno;

	__wt_page_and_tree_modify_set(session, page);
	return (0);
}
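
The publication order is the whole trick: fill in the new skiplist node's
forward pointers while it is still private, issue a write barrier, and only
then splice it into the list, so a reader can never follow a pointer into a
half-built node.  A minimal sketch of that two-phase insert, assuming C11
atomics; the node layout is hypothetical.

#include <stdatomic.h>
#include <stdint.h>

#define MAX_DEPTH	8

struct node {
	uint64_t recno;
	struct node *_Atomic next[MAX_DEPTH];
};

/*
 * ins_stack[i] points at the forward pointer that should reference the new
 * node at level i, as found by a prior search.
 */
static void
skiplist_insert(struct node *_Atomic **ins_stack,
    struct node *new_node, unsigned depth)
{
	unsigned i;

	/* First: point the new node at its successors; it isn't visible yet. */
	for (i = 0; i < depth; ++i)
		new_node->next[i] = atomic_load(ins_stack[i]);

	/* Write barrier: the node is complete before it becomes reachable. */
	atomic_thread_fence(memory_order_release);

	/* Second: splice the node into the list, level by level. */
	for (i = 0; i < depth; ++i)
		atomic_store_explicit(ins_stack[i], new_node,
		    memory_order_relaxed);
}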
Ejemplo n.º 30
0
/*
 * __rec_page_dirty_update --
 *	Update a dirty page's reference on eviction.
 */
static int
__rec_page_dirty_update(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_ADDR *addr;
	WT_PAGE_MODIFY *mod;
	WT_REF *parent_ref;

	mod = page->modify;
	parent_ref = page->ref;

	switch (F_ISSET(mod, WT_PM_REC_MASK)) {
	case WT_PM_REC_EMPTY:				/* Page is empty */
		if (parent_ref->addr != NULL &&
		    __wt_off_page(page->parent, parent_ref->addr)) {
			__wt_free(session, ((WT_ADDR *)parent_ref->addr)->addr);
			__wt_free(session, parent_ref->addr);
		}

		/*
		 * Update the parent to reference an empty page.
		 *
		 * Set the transaction ID to WT_TXN_NONE because the fact that
		 * reconciliation left the page "empty" means there's no older
		 * transaction in the system that might need to see an earlier
		 * version of the page.  It isn't necessary (WT_TXN_NONE is 0),
		 * but it's the right thing to do.
		 *
		 * Publish: a barrier to ensure the structure fields are set
		 * before the state change makes the page available to readers.
		 */
		parent_ref->page = NULL;
		parent_ref->addr = NULL;
		parent_ref->txnid = WT_TXN_NONE;
		WT_PUBLISH(parent_ref->state, WT_REF_DELETED);
		break;
	case WT_PM_REC_REPLACE: 			/* 1-for-1 page swap */
		if (parent_ref->addr != NULL &&
		    __wt_off_page(page->parent, parent_ref->addr)) {
			__wt_free(session, ((WT_ADDR *)parent_ref->addr)->addr);
			__wt_free(session, parent_ref->addr);
		}

		/*
		 * Update the parent to reference the replacement page.
		 *
		 * Publish: a barrier to ensure the structure fields are set
		 * before the state change makes the page available to readers.
		 */
		WT_RET(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));
		*addr = mod->u.replace;
		mod->u.replace.addr = NULL;
		mod->u.replace.size = 0;

		parent_ref->page = NULL;
		parent_ref->addr = addr;
		WT_PUBLISH(parent_ref->state, WT_REF_DISK);
		break;
	case WT_PM_REC_SPLIT:				/* Page split */
		/*
		 * Update the parent to reference new internal page(s).
		 *
		 * Publish: a barrier to ensure the structure fields are set
		 * before the state change makes the page available to readers.
		 */
		parent_ref->page = mod->u.split;
		WT_PUBLISH(parent_ref->state, WT_REF_MEM);

		/* Clear the reference else discarding the page will free it. */
		mod->u.split = NULL;
		F_CLR(mod, WT_PM_REC_SPLIT);
		break;
	WT_ILLEGAL_VALUE(session);
	}

	return (0);
}
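
Every arm of the switch repeats the same idiom: fully initialize the parent's
reference, then make it visible with a single publishing store of the state
word, which readers load before dereferencing anything.  A bare-bones sketch
of that idiom, assuming C11 atomics; the PUBLISH() macro and struct below are
hypothetical analogues, not WiredTiger's definitions.

#include <stdatomic.h>
#include <stddef.h>

enum ref_state { REF_DISK, REF_DELETED, REF_MEM };

struct parent_ref {
	void *page;
	void *addr;
	_Atomic enum ref_state state;
};

/* Release store: every field set above is visible before the state flips. */
#define PUBLISH(dst, val)						\
	atomic_store_explicit(&(dst), (val), memory_order_release)

/* 1-for-1 swap: the page is gone, the new address stands in for it. */
static void
swap_in_replacement(struct parent_ref *ref, void *new_addr)
{
	ref->page = NULL;
	ref->addr = new_addr;
	PUBLISH(ref->state, REF_DISK);
}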