Code example #1
File: bt_read.c  Project: doubaokun/wiredtiger
/*
 * __wt_page_in_func --
 *	Acquire a hazard pointer to a page; if the page is not in-memory,
 *	read it from the disk and build an in-memory version.
 */
int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
                  , const char *file, int line
#endif
                 )
{
    WT_BTREE *btree;
    WT_DECL_RET;
    WT_PAGE *page;
    u_int sleep_cnt, wait_cnt;
    bool busy, cache_work, oldgen, stalled;
    int force_attempts;

    btree = S2BT(session);

    for (oldgen = stalled = false,
            force_attempts = 0, sleep_cnt = wait_cnt = 0;;) {
        switch (ref->state) {
        case WT_REF_DELETED:
            if (LF_ISSET(WT_READ_NO_EMPTY) &&
                    __wt_delete_page_skip(session, ref, false))
                return (WT_NOTFOUND);
        /* FALLTHROUGH */
        case WT_REF_DISK:
            if (LF_ISSET(WT_READ_CACHE))
                return (WT_NOTFOUND);

            /*
             * The page isn't in memory, read it. If this thread is
             * allowed to do eviction work, check for space in the
             * cache.
             */
            if (!LF_ISSET(WT_READ_NO_EVICT))
                WT_RET(__wt_cache_eviction_check(
                           session, 1, NULL));
            WT_RET(__page_read(session, ref));
            oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
                     F_ISSET(session, WT_SESSION_NO_CACHE);
            continue;
        case WT_REF_READING:
            if (LF_ISSET(WT_READ_CACHE))
                return (WT_NOTFOUND);
            if (LF_ISSET(WT_READ_NO_WAIT))
                return (WT_NOTFOUND);

            /* Waiting on another thread's read, stall. */
            WT_STAT_FAST_CONN_INCR(session, page_read_blocked);
            stalled = true;
            break;
        case WT_REF_LOCKED:
            if (LF_ISSET(WT_READ_NO_WAIT))
                return (WT_NOTFOUND);

            /* Waiting on eviction, stall. */
            WT_STAT_FAST_CONN_INCR(session, page_locked_blocked);
            stalled = true;
            break;
        case WT_REF_SPLIT:
            return (WT_RESTART);
        case WT_REF_MEM:
            /*
             * The page is in memory.
             *
             * Get a hazard pointer if one is required. If no hazard
             * pointer is required the page cannot be evicted, so we're
             * done.
             */
            if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
                goto skip_evict;

            /*
             * The expected reason we can't get a hazard pointer is
             * that the page is being evicted; yield and try again.
             */
#ifdef HAVE_DIAGNOSTIC
            WT_RET(
                __wt_hazard_set(session, ref, &busy, file, line));
#else
            WT_RET(__wt_hazard_set(session, ref, &busy));
#endif
            if (busy) {
                WT_STAT_FAST_CONN_INCR(
                    session, page_busy_blocked);
                break;
            }

            /*
             * If eviction is configured for this file, check to see
             * if the page qualifies for forced eviction and update
             * the page's generation number. If eviction isn't being
             * done on this file, we're done.
             */
            if (LF_ISSET(WT_READ_NO_EVICT) ||
                    F_ISSET(session, WT_SESSION_NO_EVICTION) ||
                    F_ISSET(btree, WT_BTREE_NO_EVICTION))
                goto skip_evict;

            /*
             * Forcibly evict pages that are too big.
             */
            if (force_attempts < 10 &&
                    __evict_force_check(session, ref)) {
                ++force_attempts;
                ret = __wt_page_release_evict(session, ref);
                /* If forced eviction fails, stall. */
                if (ret == EBUSY) {
                    ret = 0;
                    WT_STAT_FAST_CONN_INCR(session,
                                           page_forcible_evict_blocked);
                    stalled = true;
                    break;
                }
                WT_RET(ret);

                /*
                 * The result of a successful forced eviction
                 * is a page-state transition (potentially to
                 * an in-memory page we can use, or a restart
                 * return for our caller), continue the outer
                 * page-acquisition loop.
                 */
                continue;
            }

            /*
             * If we read the page and we are configured to not
             * trash the cache, set the oldest read generation so
             * the page is forcibly evicted as soon as possible.
             *
             * Otherwise, update the page's read generation.
             */
            page = ref->page;
            if (oldgen && page->read_gen == WT_READGEN_NOTSET)
                __wt_page_evict_soon(page);
            else if (!LF_ISSET(WT_READ_NO_GEN) &&
                     page->read_gen != WT_READGEN_OLDEST &&
                     page->read_gen < __wt_cache_read_gen(session))
                page->read_gen =
                    __wt_cache_read_gen_bump(session);
skip_evict:
            /*
             * Check if we need an autocommit transaction.
             * Starting a transaction can trigger eviction, so skip
             * it if eviction isn't permitted.
             */
            return (LF_ISSET(WT_READ_NO_EVICT) ? 0 :
                    __wt_txn_autocommit_check(session));
            WT_ILLEGAL_VALUE(session);
        }

        /*
         * We failed to get the page -- yield before retrying, and if
         * we've yielded enough times, start sleeping so we don't burn
         * CPU to no purpose.
         */
        if (stalled)
            wait_cnt += WT_THOUSAND;
        else if (++wait_cnt < WT_THOUSAND) {
            __wt_yield();
            continue;
        }

        /*
         * If stalling and this thread is allowed to do eviction work,
         * check if the cache needs help. If we do work for the cache,
         * substitute that for a sleep.
         */
        if (!LF_ISSET(WT_READ_NO_EVICT)) {
            WT_RET(
                __wt_cache_eviction_check(session, 1, &cache_work));
            if (cache_work)
                continue;
        }
        sleep_cnt = WT_MIN(sleep_cnt + WT_THOUSAND, 10000);
        WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt);
        __wt_sleep(0, sleep_cnt);
    }
}
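The tail of that loop is a two-stage backoff: spin-yield for the first thousand misses, then sleep for increasing intervals capped at 10ms so a stalled thread doesn't burn CPU to no purpose. A minimal standalone sketch of the same pattern, with try_acquire() as a hypothetical stand-in for the page-state check:

#include <sched.h>
#include <stdbool.h>
#include <unistd.h>

#define THOUSAND	1000

/* try_acquire() is a hypothetical stand-in for the resource check. */
extern bool try_acquire(void);

void
acquire_with_backoff(void)
{
	unsigned sleep_cnt = 0, wait_cnt = 0;

	for (;;) {
		if (try_acquire())
			return;

		/* Stage one: yield the CPU for the first thousand misses. */
		if (++wait_cnt < THOUSAND) {
			sched_yield();
			continue;
		}

		/* Stage two: sleep longer each time, capped at 10ms. */
		sleep_cnt = sleep_cnt + THOUSAND < 10000 ?
		    sleep_cnt + THOUSAND : 10000;
		usleep(sleep_cnt);
	}
}

The stalled flag in the real function skips straight to the sleeping stage by adding a full thousand to the wait count at once.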
Code example #2
File: bt_io.c  Project: ezhangle/node-wiredtigerdown
/*
 * __wt_bt_write --
 *	Write a buffer into a block, returning the block's addr/size and
 * checksum.
 */
int
__wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf,
    uint8_t *addr, size_t *addr_sizep, int checkpoint, int compressed)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_ITEM *ip;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_PAGE_HEADER *dsk;
	size_t len, src_len, dst_len, result_len, size;
	int data_cksum, compression_failed;
	uint8_t *src, *dst;

	btree = S2BT(session);
	bm = btree->bm;

	/* Checkpoint calls are different from standard calls. */
	WT_ASSERT(session,
	    (checkpoint == 0 && addr != NULL && addr_sizep != NULL) ||
	    (checkpoint == 1 && addr == NULL && addr_sizep == NULL));

#ifdef HAVE_DIAGNOSTIC
	/*
	 * We're passed a table's disk image.  Decompress if necessary and
	 * verify the image.  Always check the in-memory length for accuracy.
	 */
	dsk = buf->mem;
	if (compressed) {
		WT_ERR(__wt_scr_alloc(session, dsk->mem_size, &tmp));

		memcpy(tmp->mem, buf->data, WT_BLOCK_COMPRESS_SKIP);
		WT_ERR(btree->compressor->decompress(
		    btree->compressor, &session->iface,
		    (uint8_t *)buf->data + WT_BLOCK_COMPRESS_SKIP,
		    buf->size - WT_BLOCK_COMPRESS_SKIP,
		    (uint8_t *)tmp->data + WT_BLOCK_COMPRESS_SKIP,
		    tmp->memsize - WT_BLOCK_COMPRESS_SKIP,
		    &result_len));
		WT_ASSERT(session,
		    dsk->mem_size == result_len + WT_BLOCK_COMPRESS_SKIP);
		tmp->size = (uint32_t)result_len + WT_BLOCK_COMPRESS_SKIP;
		ip = tmp;
	} else {
		WT_ASSERT(session, dsk->mem_size == buf->size);
		ip = buf;
	}
	WT_ERR(__wt_verify_dsk(session, "[write-check]", ip));
	__wt_scr_free(&tmp);
#endif

	/*
	 * Optionally stream-compress the data, but don't compress blocks that
	 * are already as small as they're going to get.
	 */
	if (btree->compressor == NULL ||
	    btree->compressor->compress == NULL || compressed)
		ip = buf;
	else if (buf->size <= btree->allocsize) {
		ip = buf;
		WT_STAT_FAST_DATA_INCR(session, compress_write_too_small);
	} else {
		/* Skip the header bytes of the source data. */
		src = (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP;
		src_len = buf->size - WT_BLOCK_COMPRESS_SKIP;

		/*
		 * Compute the size needed for the destination buffer.  By
		 * default we only allocate enough memory for a copy of the
		 * original: if the compressed version is bigger than the
		 * original, we won't use it.  However, some compression
		 * engines (snappy is one example) may need more memory
		 * because they don't stop just because there's no more
		 * memory into which to compress.
		 */
		if (btree->compressor->pre_size == NULL)
			len = src_len;
		else
			WT_ERR(btree->compressor->pre_size(btree->compressor,
			    &session->iface, src, src_len, &len));

		size = len + WT_BLOCK_COMPRESS_SKIP;
		WT_ERR(bm->write_size(bm, session, &size));
		WT_ERR(__wt_scr_alloc(session, size, &tmp));

		/* Skip the header bytes of the destination data. */
		dst = (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP;
		dst_len = len;

		compression_failed = 0;
		WT_ERR(btree->compressor->compress(btree->compressor,
		    &session->iface,
		    src, src_len,
		    dst, dst_len,
		    &result_len, &compression_failed));
		result_len += WT_BLOCK_COMPRESS_SKIP;

		/*
		 * If compression fails, or doesn't gain us at least one unit
		 * of allocation, fall back to the original version.  This
		 * isn't unexpected: if compression doesn't work for some
		 * chunk of data (likely because of the additional format or
		 * header information compressed output requires), it just
		 * means the uncompressed version is as good as it gets, and
		 * that's what we use.
		 */
		if (compression_failed ||
		    buf->size / btree->allocsize ==
		    result_len / btree->allocsize) {
			ip = buf;
			WT_STAT_FAST_DATA_INCR(session, compress_write_fail);
		} else {
			compressed = 1;
			WT_STAT_FAST_DATA_INCR(session, compress_write);

			/*
			 * Copy in the skipped header bytes, set the final data
			 * size.
			 */
			memcpy(tmp->mem, buf->mem, WT_BLOCK_COMPRESS_SKIP);
			tmp->size = result_len;
			ip = tmp;
		}
	}
	dsk = ip->mem;

	/* If the buffer is compressed, set the flag. */
	if (compressed)
		F_SET(dsk, WT_PAGE_COMPRESSED);

	/*
	 * We increment the block's write generation so it's easy to identify
	 * newer versions of blocks during salvage.  (It's common in WiredTiger,
	 * at least for the default block manager, for multiple blocks to be
	 * internally consistent with identical first and last keys, so we need
	 * a way to know the most recent state of the block.  We could check
	 * which leaf is referenced by a valid internal page, but that implies
	 * salvaging internal pages, which I don't want to do, and it's not as
	 * good anyway, because the internal page may not have been written
	 * after the leaf page was updated.  So, write generations it is.)
	 *
	 * Nothing is locked at this point, but two versions of a page with the
	 * same generation are pretty unlikely, and even if we wrote two, they
	 * are going to be roughly identical for the purposes of salvage anyway.
	 */
	dsk->write_gen = ++btree->write_gen;

	/*
	 * Decide whether to checksum the data: always if checksums are
	 * configured on, never if they're configured off, and only for
	 * uncompressed blocks in the default "uncompressed" mode.
	 */
	switch (btree->checksum) {
	case CKSUM_ON:
		data_cksum = 1;
		break;
	case CKSUM_OFF:
		data_cksum = 0;
		break;
	case CKSUM_UNCOMPRESSED:
	default:
		data_cksum = !compressed;
		break;
	}

	/* Call the block manager to write the block. */
	WT_ERR(checkpoint ?
	    bm->checkpoint(bm, session, ip, btree->ckpt, data_cksum) :
	    bm->write(bm, session, ip, addr, addr_sizep, data_cksum));

	WT_STAT_FAST_CONN_INCR(session, cache_write);
	WT_STAT_FAST_DATA_INCR(session, cache_write);
	WT_STAT_FAST_CONN_INCRV(session, cache_bytes_write, ip->size);
	WT_STAT_FAST_DATA_INCRV(session, cache_bytes_write, ip->size);

err:	__wt_scr_free(&tmp);
	return (ret);
}
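The core policy above is that compression is only worth keeping when it saves at least one allocation-sized unit, because blocks are allocated in allocsize multiples either way. A sketch of that test (a standalone helper, not WiredTiger's API):

#include <stdbool.h>
#include <stddef.h>

/*
 * Keep a compressed image only if it frees at least one allocation-sized
 * unit; otherwise the uncompressed original is as good as it gets.
 */
static bool
compression_useful(size_t orig_len, size_t comp_len, size_t allocsize)
{
	return (comp_len / allocsize < orig_len / allocsize);
}

For example, with a 4KB allocation size, shrinking 8000 bytes to 7000 gains nothing (both round up to two units), while shrinking to 4000 bytes frees a unit and the compressed image is kept.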
Code example #3
File: conn_log.c  Project: judahschvimer/mongo
/*
 * __wt_log_wrlsn --
 *	Process written log slots and attempt to coalesce them if the LSNs
 *	are contiguous.  The purpose of this function is to advance the
 *	write_lsn in LSN order after the buffer is written to the log file.
 */
void
__wt_log_wrlsn(WT_SESSION_IMPL *session, int *yield)
{
	WT_CONNECTION_IMPL *conn;
	WT_LOG *log;
	WT_LOG_WRLSN_ENTRY written[WT_SLOT_POOL];
	WT_LOGSLOT *coalescing, *slot;
	WT_LSN save_lsn;
	size_t written_i;
	uint32_t i, save_i;

	conn = S2C(session);
	log = conn->log;
	__wt_spin_lock(session, &log->log_writelsn_lock);
restart:
	coalescing = NULL;
	WT_INIT_LSN(&save_lsn);
	written_i = 0;
	i = 0;

	/*
	 * Walk the array once saving any slots that are in the
	 * WT_LOG_SLOT_WRITTEN state.
	 */
	while (i < WT_SLOT_POOL) {
		save_i = i;
		slot = &log->slot_pool[i++];
		if (slot->slot_state != WT_LOG_SLOT_WRITTEN)
			continue;
		written[written_i].slot_index = save_i;
		written[written_i++].lsn = slot->slot_release_lsn;
	}
	/*
	 * If we found any written slots, process them: sort them by
	 * release LSN, then walk them in order.
	 */
	if (written_i > 0) {
		if (yield != NULL)
			*yield = 0;
		WT_INSERTION_SORT(written, written_i,
		    WT_LOG_WRLSN_ENTRY, WT_WRLSN_ENTRY_CMP_LT);
		/*
		 * We know the written array is sorted by LSN.  Go
		 * through it, either advancing write_lsn or coalescing
		 * contiguous ranges of written slots.
		 */
		for (i = 0; i < written_i; i++) {
			slot = &log->slot_pool[written[i].slot_index];
			/*
			 * The log server thread pushes out slots periodically.
			 * Sometimes they are empty slots.  If we find an
			 * empty slot, where empty means the start and end LSN
			 * are the same, free it and continue.
			 */
			if (__wt_log_cmp(&slot->slot_start_lsn,
			    &slot->slot_release_lsn) == 0 &&
			    __wt_log_cmp(&slot->slot_start_lsn,
			    &slot->slot_end_lsn) == 0) {
				__wt_log_slot_free(session, slot);
				continue;
			}
			if (coalescing != NULL) {
				/*
				 * If the write_lsn changed, we may be able to
				 * process slots.  Try again.
				 */
				if (__wt_log_cmp(
				    &log->write_lsn, &save_lsn) != 0)
					goto restart;
				if (__wt_log_cmp(&coalescing->slot_end_lsn,
				    &written[i].lsn) != 0) {
					coalescing = slot;
					continue;
				}
				/*
				 * If we get here we have a slot to coalesce
				 * and free.
				 */
				coalescing->slot_last_offset =
				    slot->slot_last_offset;
				coalescing->slot_end_lsn = slot->slot_end_lsn;
				WT_STAT_FAST_CONN_INCR(
				    session, log_slot_coalesced);
				/*
				 * Copy the flag for later closing.
				 */
				if (F_ISSET(slot, WT_SLOT_CLOSEFH))
					F_SET(coalescing, WT_SLOT_CLOSEFH);
			} else {
				/*
				 * If this written slot is not the next LSN,
				 * try to start coalescing with later slots.
				 * A synchronous write may update write_lsn
				 * so save the last one we saw to check when
				 * coalescing slots.
				 */
				save_lsn = log->write_lsn;
				if (__wt_log_cmp(
				    &log->write_lsn, &written[i].lsn) != 0) {
					coalescing = slot;
					continue;
				}
				/*
				 * If we get here we have a slot to process.
				 * Advance the LSN and process the slot.
				 */
				WT_ASSERT(session, __wt_log_cmp(&written[i].lsn,
				    &slot->slot_release_lsn) == 0);
				/*
				 * We need to maintain the starting offset of
				 * a log record so that the checkpoint LSN
				 * refers to the beginning of a real record.
				 * The last offset in a slot is kept so that
				 * the checkpoint LSN is close to the end of
				 * the record.
				 */
				if (slot->slot_start_lsn.l.offset !=
				    slot->slot_last_offset)
					slot->slot_start_lsn.l.offset =
					    (uint32_t)slot->slot_last_offset;
				log->write_start_lsn = slot->slot_start_lsn;
				log->write_lsn = slot->slot_end_lsn;
				__wt_cond_signal(session, log->log_write_cond);
				WT_STAT_FAST_CONN_INCR(session, log_write_lsn);
				/*
				 * Signal the close thread if needed.
				 */
				if (F_ISSET(slot, WT_SLOT_CLOSEFH))
					__wt_cond_signal(
					    session, conn->log_file_cond);
			}
			__wt_log_slot_free(session, slot);
		}
	}
	__wt_spin_unlock(session, &log->log_writelsn_lock);
}
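The mechanism is generic: collect finished chunks, sort them by starting position, and advance a single "written up to" cursor across every chunk that begins exactly where the previous one ended. A self-contained sketch with plain byte offsets standing in for LSNs (all the names here are illustrative):

#include <stdint.h>
#include <stdlib.h>

struct chunk {
	uint64_t start;		/* plays the role of slot_release_lsn */
	uint64_t end;		/* plays the role of slot_end_lsn */
};

static int
chunk_cmp(const void *a, const void *b)
{
	const struct chunk *ca = a, *cb = b;

	return (ca->start < cb->start ? -1 : ca->start > cb->start);
}

/*
 * Sort the chunks, then advance write_pos over every chunk that starts
 * exactly where the previous one ended; return the new write position.
 */
static uint64_t
advance_write_pos(struct chunk *chunks, size_t n, uint64_t write_pos)
{
	size_t i;

	qsort(chunks, n, sizeof(struct chunk), chunk_cmp);
	for (i = 0; i < n && chunks[i].start == write_pos; i++)
		write_pos = chunks[i].end;
	return (write_pos);
}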
Code example #4
File: os_alloc.c  Project: RolfAndreassen/wiredtiger
/*
 * __wt_realloc_aligned --
 *	ANSI realloc function that aligns to buffer boundaries, configured with
 *	the "buffer_alignment" key to wiredtiger_open.
 */
int
__wt_realloc_aligned(WT_SESSION_IMPL *session,
                     size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp)
{
#if defined(HAVE_POSIX_MEMALIGN)
    WT_DECL_RET;

    /*
     * !!!
     * This function MUST handle a NULL WT_SESSION_IMPL handle.
     */
    if (session != NULL && S2C(session)->buffer_alignment > 0) {
        void *p, *newp;
        size_t bytes_allocated;

        /*
         * Sometimes we're allocating memory and we don't care about the
         * final length -- bytes_allocated_ret may be NULL.
         */
        p = *(void **)retp;
        bytes_allocated =
            (bytes_allocated_ret == NULL) ? 0 : *bytes_allocated_ret;
        WT_ASSERT(session,
                  (p == NULL && bytes_allocated == 0) ||
                  (p != NULL &&
                   (bytes_allocated_ret == NULL || bytes_allocated != 0)));
        WT_ASSERT(session, bytes_to_allocate != 0);
        WT_ASSERT(session, bytes_allocated < bytes_to_allocate);

        if (session != NULL)
            WT_STAT_FAST_CONN_INCR(session, memory_allocation);

        if ((ret = posix_memalign(&newp,
                                  S2C(session)->buffer_alignment,
                                  bytes_to_allocate)) != 0)
            WT_RET_MSG(session, ret, "memory allocation");

        if (p != NULL)
            memcpy(newp, p, bytes_allocated);
        __wt_free(session, p);
        p = newp;

        /* Clear the newly allocated portion of the memory. */
        memset((uint8_t *)p + bytes_allocated, 0,
               bytes_to_allocate - bytes_allocated);

        /* Update caller's bytes allocated value. */
        if (bytes_allocated_ret != NULL)
            *bytes_allocated_ret = bytes_to_allocate;

        *(void **)retp = p;
        return (0);
    }
#endif
    /*
     * If there is no posix_memalign function, or no alignment configured,
     * fall back to realloc.
     */
    return (__wt_realloc(
                session, bytes_allocated_ret, bytes_to_allocate, retp));
}
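POSIX provides no aligned counterpart to realloc, which is why the function above does allocate-copy-free by hand. A minimal portable sketch of that pattern, assuming align is a power of two multiple of sizeof(void *) (posix_memalign's requirement) and new_len >= old_len:

#include <stdlib.h>
#include <string.h>

/*
 * Grow an aligned allocation: posix_memalign a new block, copy the old
 * contents, zero the tail, free the old block.  Returns NULL on failure
 * and leaves the original allocation untouched.
 */
static void *
realloc_aligned(void *old, size_t old_len, size_t new_len, size_t align)
{
	void *newp;

	if (posix_memalign(&newp, align, new_len) != 0)
		return (NULL);
	if (old != NULL)
		memcpy(newp, old, old_len);
	memset((char *)newp + old_len, 0, new_len - old_len);
	free(old);
	return (newp);
}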
Code example #5
File: bt_cursor.c  Project: radik/mongo
/*
 * __wt_btcur_insert --
 *	Insert a record into the tree.
 */
int
__wt_btcur_insert(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;

	WT_STAT_FAST_CONN_INCR(session, cursor_insert);
	WT_STAT_FAST_DATA_INCR(session, cursor_insert);
	WT_STAT_FAST_DATA_INCRV(session,
	    cursor_insert_bytes, cursor->key.size + cursor->value.size);

	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));
	WT_RET(__cursor_size_chk(session, &cursor->value));

	/*
	 * The tree is no longer empty: eviction should pay attention to it,
	 * and it's no longer possible to bulk-load into it.
	 */
	if (btree->bulk_load_ok) {
		btree->bulk_load_ok = 0;
		__wt_btree_evictable(session, 1);
	}

retry:	WT_RET(__cursor_func_init(cbt, 1));

	switch (btree->type) {
	case BTREE_COL_FIX:
	case BTREE_COL_VAR:
		/*
		 * If WT_CURSTD_APPEND is set, insert a new record (ignoring
		 * the application's record number).  First we search for the
		 * maximum possible record number so the search ends on the
		 * last page.  The real record number is assigned by the
		 * serialized append operation.
		 */
		if (F_ISSET(cursor, WT_CURSTD_APPEND))
			cbt->iface.recno = UINT64_MAX;

		WT_ERR(__cursor_col_search(session, cbt, NULL));

		if (F_ISSET(cursor, WT_CURSTD_APPEND))
			cbt->iface.recno = 0;

		/*
		 * If not overwriting, fail if the key exists.  Creating a
		 * record past the end of the tree in a fixed-length
		 * column-store implicitly fills the gap with empty records.
		 * Fail in that case: the record exists.
		 */
		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
		    ((cbt->compare == 0 && __cursor_valid(cbt, NULL)) ||
		    (cbt->compare != 0 && __cursor_fix_implicit(btree, cbt))))
			WT_ERR(WT_DUPLICATE_KEY);

		WT_ERR(__cursor_col_modify(session, cbt, 0));
		if (F_ISSET(cursor, WT_CURSTD_APPEND))
			cbt->iface.recno = cbt->recno;
		break;
	case BTREE_ROW:
		WT_ERR(__cursor_row_search(session, cbt, NULL, 1));
		/*
		 * If not overwriting, fail if the key exists, else insert the
		 * key/value pair.
		 */
		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
		    cbt->compare == 0 && __cursor_valid(cbt, NULL))
			WT_ERR(WT_DUPLICATE_KEY);

		ret = __cursor_row_modify(session, cbt, 0);
		break;
	WT_ILLEGAL_VALUE_ERR(session);
	}

err:	if (ret == WT_RESTART) {
		WT_STAT_FAST_CONN_INCR(session, cursor_restart);
		WT_STAT_FAST_DATA_INCR(session, cursor_restart);
		goto retry;
	}
	/* Insert doesn't maintain a position across calls, clear resources. */
	if (ret == 0)
		WT_TRET(__curfile_leave(cbt));
	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}
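The control flow wrapped around the switch is shared by all the cursor operations in this file: run the operation, and on WT_RESTART (a page split raced with the search) re-position and try again; any other outcome releases resources and returns. A stripped-down sketch of that skeleton, where do_operation() and the RESTART value are hypothetical stand-ins, not WiredTiger's:

#define RESTART	1	/* illustrative value, not WT_RESTART's */

extern int do_operation(void);	/* hypothetical racy operation */

static int
run_with_restart(void)
{
	int ret;

retry:	if ((ret = do_operation()) == RESTART)
		goto retry;

	/* Success or a hard error: clean up and return. */
	return (ret);
}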
Code example #6
File: bt_cursor.c  Project: radik/mongo
/*
 * __wt_btcur_update --
 *	Update a record in the tree.
 */
int
__wt_btcur_update(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;

	WT_STAT_FAST_CONN_INCR(session, cursor_update);
	WT_STAT_FAST_DATA_INCR(session, cursor_update);
	WT_STAT_FAST_DATA_INCRV(
	    session, cursor_update_bytes, cursor->value.size);

	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));
	WT_RET(__cursor_size_chk(session, &cursor->value));

	/*
	 * The tree is no longer empty: eviction should pay attention to it,
	 * and it's no longer possible to bulk-load into it.
	 */
	if (btree->bulk_load_ok) {
		btree->bulk_load_ok = 0;
		__wt_btree_evictable(session, 1);
	}

retry:	WT_RET(__cursor_func_init(cbt, 1));

	switch (btree->type) {
	case BTREE_COL_FIX:
	case BTREE_COL_VAR:
		WT_ERR(__cursor_col_search(session, cbt, NULL));

		/*
		 * If not overwriting, fail if the key doesn't exist.  If we
		 * find an update for the key, check for conflicts.  Update the
		 * record if it exists.  Creating a record past the end of the
		 * tree in a fixed-length column-store implicitly fills the gap
		 * with empty records.  Update the record in that case: the
		 * record exists.
		 */
		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
			WT_ERR(__curfile_update_check(cbt));
			if ((cbt->compare != 0 || !__cursor_valid(cbt, NULL)) &&
			    !__cursor_fix_implicit(btree, cbt))
				WT_ERR(WT_NOTFOUND);
		}
		ret = __cursor_col_modify(session, cbt, 0);
		break;
	case BTREE_ROW:
		WT_ERR(__cursor_row_search(session, cbt, NULL, 1));
		/*
		 * If not overwriting, check for conflicts and fail if the key
		 * does not exist.
		 */
		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
			WT_ERR(__curfile_update_check(cbt));
			if (cbt->compare != 0 || !__cursor_valid(cbt, NULL))
				WT_ERR(WT_NOTFOUND);
		}
		ret = __cursor_row_modify(session, cbt, 0);
		break;
	WT_ILLEGAL_VALUE_ERR(session);
	}

err:	if (ret == WT_RESTART) {
		WT_STAT_FAST_CONN_INCR(session, cursor_restart);
		WT_STAT_FAST_DATA_INCR(session, cursor_restart);
		goto retry;
	}

	/*
	 * If successful, point the cursor at internal copies of the data.  We
	 * could shuffle memory in the cursor so the key/value pair are in local
	 * buffer memory, but that's a data copy.  We don't want to do another
	 * search (and we might get a different update structure if we race).
	 * To make this work, we add a field to the btree cursor to pass back a
	 * pointer to the modify function's allocated update structure.
	 */
	if (ret == 0)
		WT_TRET(__wt_kv_return(session, cbt, cbt->modify_update));

	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}
Code example #7
File: bt_cursor.c  Project: rueckstiess/mongo
/*
 * __wt_btcur_search_near --
 *	Search for a record in the tree.
 */
int
__wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	int exact;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	exact = 0;

	WT_STAT_FAST_CONN_INCR(session, cursor_search_near);
	WT_STAT_FAST_DATA_INCR(session, cursor_search_near);

	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));

	WT_RET(__cursor_func_init(cbt, 1));

	/*
	 * Set the "insert" flag for the btree row-store search; we may intend
	 * to position our cursor at the end of the tree, rather than match an
	 * existing record.
	 */
	WT_ERR(btree->type == BTREE_ROW ?
	    __cursor_row_search(session, cbt, 1) :
	    __cursor_col_search(session, cbt));

	/*
	 * If we find a valid key, return it.
	 *
	 * Else, creating a record past the end of the tree in a fixed-length
	 * column-store implicitly fills the gap with empty records.  In this
	 * case, we instantiate the empty record, it's an exact match.
	 *
	 * Else, move to the next key in the tree (bias for prefix searches).
	 * Cursor next skips invalid rows, so we don't have to test for them
	 * again.
	 *
	 * Else, redo the search and move to the previous key in the tree.
	 * Cursor previous skips invalid rows, so we don't have to test for
	 * them again.
	 *
	 * If that fails, quit, there's no record to return.
	 */
	if (__cursor_valid(cbt, &upd)) {
		exact = cbt->compare;
		ret = __wt_kv_return(session, cbt, upd);
	} else if (__cursor_fix_implicit(btree, cbt)) {
		cbt->recno = cursor->recno;
		cbt->v = 0;
		cursor->value.data = &cbt->v;
		cursor->value.size = 1;
		exact = 0;
	} else if ((ret = __wt_btcur_next(cbt, 0)) != WT_NOTFOUND)
		exact = 1;
	else {
		WT_ERR(btree->type == BTREE_ROW ?
		    __cursor_row_search(session, cbt, 1) :
		    __cursor_col_search(session, cbt));
		if (__cursor_valid(cbt, &upd)) {
			exact = cbt->compare;
			ret = __wt_kv_return(session, cbt, upd);
		} else if ((ret = __wt_btcur_prev(cbt, 0)) != WT_NOTFOUND)
			exact = -1;
	}

err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	if (exactp != NULL && (ret == 0 || ret == WT_NOTFOUND))
		*exactp = exact;
	return (ret);
}
Code example #8
File: bt_cursor.c  Project: radik/mongo
/*
 * __wt_btcur_search --
 *	Search for a matching record in the tree.
 */
int
__wt_btcur_search(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	int valid;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	upd = NULL;					/* -Wuninitialized */

	WT_STAT_FAST_CONN_INCR(session, cursor_search);
	WT_STAT_FAST_DATA_INCR(session, cursor_search);

	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));

	/*
	 * If we have a page pinned, search it; if we don't have a page pinned,
	 * or the search of the pinned page doesn't find an exact match, search
	 * from the root.
	 */
	valid = 0;
	if (F_ISSET(cbt, WT_CBT_ACTIVE) &&
	    cbt->ref->page->read_gen != WT_READGEN_OLDEST) {
		__wt_txn_cursor_op(session);

		WT_ERR(btree->type == BTREE_ROW ?
		    __cursor_row_search(session, cbt, cbt->ref, 0) :
		    __cursor_col_search(session, cbt, cbt->ref));
		valid = cbt->compare == 0 && __cursor_valid(cbt, &upd);
	}
	if (!valid) {
		WT_ERR(__cursor_func_init(cbt, 1));

		WT_ERR(btree->type == BTREE_ROW ?
		    __cursor_row_search(session, cbt, NULL, 0) :
		    __cursor_col_search(session, cbt, NULL));
		valid = cbt->compare == 0 && __cursor_valid(cbt, &upd);
	}

	if (valid)
		ret = __wt_kv_return(session, cbt, upd);
	else if (__cursor_fix_implicit(btree, cbt)) {
		/*
		 * Creating a record past the end of the tree in a fixed-length
		 * column-store implicitly fills the gap with empty records.
		 */
		cbt->recno = cursor->recno;
		cbt->v = 0;
		cursor->value.data = &cbt->v;
		cursor->value.size = 1;
	} else
		ret = WT_NOTFOUND;

err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}
Code example #9
File: bt_delete.c  Project: Zhangwusheng/wiredtiger
/*
 * __wt_delete_page --
 *	If deleting a range, try to delete the page without instantiating it.
 */
int
__wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
	WT_DECL_RET;
	WT_PAGE *parent;

	*skipp = false;

	/* If we have a clean page in memory, attempt to evict it. */
	if (ref->state == WT_REF_MEM &&
	    __wt_atomic_casv32(&ref->state, WT_REF_MEM, WT_REF_LOCKED)) {
		if (__wt_page_is_modified(ref->page)) {
			WT_PUBLISH(ref->state, WT_REF_MEM);
			return (0);
		}

		(void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1);
		ret = __wt_evict(session, ref, false);
		(void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1);
		WT_RET_BUSY_OK(ret);
	}

	/*
	 * Atomically switch the page's state to lock it.  If the page is not
	 * on-disk, other threads may be using it, no fast delete.
	 *
	 * Possible optimization: if the page is already deleted and the delete
	 * is visible to us (the delete has been committed), we could skip the
	 * page instead of instantiating it and figuring out there are no rows
	 * in the page.  While that's a huge amount of work to no purpose, it's
	 * unclear optimizing for overlapping range deletes is worth the effort.
	 */
	if (ref->state != WT_REF_DISK ||
	    !__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED))
		return (0);

	/*
	 * We cannot fast-delete pages that have overflow key/value items as
	 * the overflow blocks have to be discarded.  The way we figure that
	 * out is to check the page's cell type, cells for leaf pages without
	 * overflow items are special.
	 *
	 * To look at an on-page cell, we need to look at the parent page, and
	 * that's dangerous: our parent page could change without warning if
	 * it were to split, deepening the tree.  Even so, it's safe: the
	 * page's reference will always point to some valid page, and if we
	 * find any problems we simply fail the fast-delete optimization.
	 */
	parent = ref->home;
	if (__wt_off_page(parent, ref->addr) ?
	    ((WT_ADDR *)ref->addr)->type != WT_ADDR_LEAF_NO :
	    __wt_cell_type_raw(ref->addr) != WT_CELL_ADDR_LEAF_NO)
		goto err;

	/*
	 * This action dirties the parent page: mark it dirty now, there's no
	 * future reconciliation of the child leaf page that will dirty it as
	 * we write the tree.
	 */
	WT_ERR(__wt_page_parent_modify_set(session, ref, false));

	/*
	 * Record the change in the transaction structure and set the change's
	 * transaction ID.
	 */
	WT_ERR(__wt_calloc_one(session, &ref->page_del));
	ref->page_del->txnid = session->txn.id;

	WT_ERR(__wt_txn_modify_ref(session, ref));

	*skipp = true;
	WT_STAT_FAST_CONN_INCR(session, rec_page_delete_fast);
	WT_STAT_FAST_DATA_INCR(session, rec_page_delete_fast);
	WT_PUBLISH(ref->state, WT_REF_DELETED);
	return (0);

err:	__wt_free(session, ref->page_del);

	/*
	 * Restore the page to on-disk status, we'll have to instantiate it.
	 */
	WT_PUBLISH(ref->state, WT_REF_DISK);
	return (ret);
}
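Everything here hangs on an atomic compare-and-swap of the page state: only the thread that wins the WT_REF_DISK to WT_REF_LOCKED transition proceeds, and every exit path publishes a final state so other threads never observe the page locked indefinitely. A minimal sketch of that discipline with C11 atomics (the state names and helper are illustrative):

#include <stdatomic.h>
#include <stdbool.h>

enum { STATE_DISK, STATE_LOCKED, STATE_DELETED };

/*
 * Attempt the DISK -> LOCKED transition; only one thread can win.  The
 * winner does its work, then publishes DELETED on success or restores
 * DISK on failure.
 */
static bool
try_fast_delete(_Atomic int *state)
{
	int expected = STATE_DISK;

	if (!atomic_compare_exchange_strong(state, &expected, STATE_LOCKED))
		return (false);		/* Another thread owns the page. */

	/* ... examine the page; on any problem, restore DISK and bail ... */

	atomic_store(state, STATE_DELETED);
	return (true);
}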
Code example #10
File: block_write.c  Project: rain10154/wiredtiger
/* Write the buffer to the file underlying the block, returning the block's offset, size and checksum. */
int __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t *offsetp, 
						uint32_t *sizep, uint32_t *cksump, int data_cksum, int caller_locked)
{
	WT_BLOCK_HEADER *blk;
	WT_DECL_RET;
	WT_FH *fh;
	size_t align_size;
	wt_off_t offset;
	int local_locked;

	blk = WT_BLOCK_HEADER_REF(buf->mem);
	fh = block->fh;
	local_locked = 0;

	/* The buffer must be allocated aligned: this is a disk-level write and direct I/O requires aligned buffers. */
	if(!F_ISSET(buf, WT_ITEM_ALIGNED)){
		WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
		WT_RET_MSG(session, EINVAL, "direct I/O check: write buffer incorrectly allocated");
	}

	/* Align buf->size to the block allocation size.  The aligned size may exceed buf->memsize; if so, writing would overrun the buffer. */
	align_size = WT_ALIGN(buf->size, block->allocsize);
	if (align_size > buf->memsize) {
		WT_ASSERT(session, align_size <= buf->memsize);
		WT_RET_MSG(session, EINVAL, "buffer size check: write buffer incorrectly allocated");
	}
	/* Fail if the aligned size exceeds 4GB. */
	if (align_size > UINT32_MAX) {
		WT_ASSERT(session, align_size <= UINT32_MAX);
		WT_RET_MSG(session, EINVAL, "buffer size check: write buffer too large to write");
	}

	/* Zero the padding bytes introduced by the alignment. */
	memset((uint8_t*)buf->mem + buf->size, 0, align_size - buf->size);

	/* Set up the block header and the stored data length. */
	blk->disk_size = WT_STORE_SIZE(align_size);
	blk->flags = 0;
	if(data_cksum)
		F_SET(blk, WT_BLOCK_DATA_CKSUM);

	/* Compute the buffer's checksum. */
	blk->cksum = __wt_cksum(buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP);

	if (!caller_locked) {
		WT_RET(__wt_block_ext_prealloc(session, 5));
		__wt_spin_lock(session, &block->live_lock);
		local_locked = 1;
	}

	ret = __wt_block_alloc(session, block, &offset, (wt_off_t)align_size);
	/* Check whether the file needs to be extended; if it isn't, it may not have room for the block being written. */
	if(ret == 0 && fh->extend_len != 0 && (fh->extend_size <= fh->size ||
		(offset + fh->extend_len <= fh->extend_size && offset + fh->extend_len + (wt_off_t)align_size >= fh->extend_size))){
			/* Set extend_size to the offset plus twice the extension length. */
			fh->extend_size = offset + fh->extend_len * 2;
			if (fh->fallocate_available != WT_FALLOCATE_NOT_AVAILABLE) {
				/* Drop the block->live_lock spinlock before resizing the file: it can take a while, and holding a spinlock would leave other CPUs spinning. */
				if (!fh->fallocate_requires_locking && local_locked) {
					__wt_spin_unlock(session, &block->live_lock);
					local_locked = 0;
				}

				/* Preallocate space in the file. */
				if ((ret = __wt_fallocate(session,fh, offset, fh->extend_len * 2)) == ENOTSUP) {
					ret = 0;
					goto extend_truncate;
				}
			}
			else{
extend_truncate:
				if (!caller_locked && local_locked == 0) {
					__wt_spin_lock(session, &block->live_lock);
					local_locked = 1;
				}
				/* Resize the file directly; slower than __wt_fallocate. */
				if ((ret = __wt_ftruncate(session, fh, offset + fh->extend_len * 2)) == EBUSY)
					ret = 0;
			}
	}

	if(local_locked){
		__wt_spin_unlock(session, &block->live_lock);
		local_locked = 0;
	}

	WT_RET(ret);
	/* Write the block's data. */
	ret =__wt_write(session, fh, offset, align_size, buf->mem);
	if (ret != 0) {
		if (!caller_locked)
			__wt_spin_lock(session, &block->live_lock);
		/* The write failed: return the extent to the available list. */
		WT_TRET(__wt_block_off_free(session, block, offset, (wt_off_t)align_size));
		if (!caller_locked)
			__wt_spin_unlock(session, &block->live_lock);

		WT_RET(ret);
	}

#ifdef HAVE_SYNC_FILE_RANGE
	/* Too many dirty pages have accumulated in the OS cache: trigger an asynchronous flush. */
	if (block->os_cache_dirty_max != 0 && (block->os_cache_dirty += align_size) > block->os_cache_dirty_max && __wt_session_can_wait(session)) {
			block->os_cache_dirty = 0;
			WT_RET(__wt_fsync_async(session, fh));
	}
#endif

#ifdef HAVE_POSIX_FADVISE
	/* Drop this file's data from the system page cache; this may do I/O, much like a synchronous sync call. */
	if (block->os_cache_max != 0 && (block->os_cache += align_size) > block->os_cache_max) {
		block->os_cache = 0;
		if ((ret = posix_fadvise(fh->fd, (wt_off_t)0, (wt_off_t)0, POSIX_FADV_DONTNEED)) != 0)
			WT_RET_MSG( session, ret, "%s: posix_fadvise", block->name);
	}
#endif

	WT_STAT_FAST_CONN_INCR(session, block_write);
	WT_STAT_FAST_CONN_INCRV(session, block_byte_write, align_size);

	WT_RET(__wt_verbose(session, WT_VERB_WRITE, "off %" PRIuMAX ", size %" PRIuMAX ", cksum %" PRIu32, 
							(uintmax_t)offset, (uintmax_t)align_size, blk->cksum));

	*offsetp = offset;
	*sizep = WT_STORE_SIZE(align_size);
	*cksump = blk->cksum;

	return ret;
}
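The first step above rounds the payload up to the allocation size and zeroes the padding, because direct I/O requires aligned sizes. A sketch of the round-up arithmetic for power-of-two allocation sizes, the usual trick behind macros like WT_ALIGN:

#include <stdint.h>

/*
 * Round n up to the next multiple of align, where align is a power of
 * two.
 */
static inline uint64_t
align_up(uint64_t n, uint64_t align)
{
	return ((n + align - 1) & ~(align - 1));
}

The gap between the original size and the aligned size is exactly the span the memset in the function clears before the write.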
Code example #11
File: bt_io.c  Project: GYGit/mongo
/*
 * __wt_bt_read --
 *	Read a cookie referenced block into a buffer.
 */
int
__wt_bt_read(WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_ITEM(etmp);
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_ENCRYPTOR *encryptor;
	WT_ITEM *ip;
	const WT_PAGE_HEADER *dsk;
	const char *fail_msg;
	size_t result_len;

	btree = S2BT(session);
	bm = btree->bm;
	fail_msg = NULL;			/* -Wuninitialized */

	/*
	 * If anticipating a compressed or encrypted block, read into a scratch
	 * buffer and decompress into the caller's buffer.  Else, read directly
	 * into the caller's buffer.
	 */
	if (btree->compressor == NULL && btree->kencryptor == NULL) {
		WT_RET(bm->read(bm, session, buf, addr, addr_size));
		dsk = buf->data;
		ip = NULL;
	} else {
		WT_RET(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->read(bm, session, tmp, addr, addr_size));
		dsk = tmp->data;
		ip = tmp;
	}

	/*
	 * If the block is encrypted, copy the skipped bytes of the original
	 * image into place, then decrypt.
	 */
	if (F_ISSET(dsk, WT_PAGE_ENCRYPTED)) {
		if (btree->kencryptor == NULL ||
		    (encryptor = btree->kencryptor->encryptor) == NULL ||
		    encryptor->decrypt == NULL) {
			fail_msg =
			    "encrypted block in file for which no encryption "
			    "configured";
			goto corrupt;
		}

		WT_ERR(__wt_scr_alloc(session, 0, &etmp));
		if ((ret = __wt_decrypt(session,
		    encryptor, WT_BLOCK_ENCRYPT_SKIP, ip, etmp)) != 0) {
			fail_msg = "block decryption failed";
			goto corrupt;
		}

		ip = etmp;
		dsk = ip->data;
	} else if (btree->kencryptor != NULL) {
		fail_msg =
		    "unencrypted block in file for which encryption configured";
		goto corrupt;
	}

	if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) {
		if (btree->compressor == NULL ||
		    btree->compressor->decompress == NULL) {
			fail_msg =
			    "compressed block in file for which no compression "
			    "configured";
			goto corrupt;
		}

		/*
		 * Size the buffer based on the in-memory bytes we're expecting
		 * from decompression.
		 */
		WT_ERR(__wt_buf_initsize(session, buf, dsk->mem_size));

		/*
		 * Note the source length is NOT the number of compressed bytes,
		 * it's the length of the block we just read (minus the skipped
		 * bytes).  We don't store the number of compressed bytes: some
		 * compression engines need that length stored externally, they
		 * don't have markers in the stream to signal the end of the
		 * compressed bytes.  Those engines must store the compressed
		 * byte length somehow, see the snappy compression extension for
		 * an example.
		 */
		memcpy(buf->mem, ip->data, WT_BLOCK_COMPRESS_SKIP);
		ret = btree->compressor->decompress(
		    btree->compressor, &session->iface,
		    (uint8_t *)ip->data + WT_BLOCK_COMPRESS_SKIP,
		    tmp->size - WT_BLOCK_COMPRESS_SKIP,
		    (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP,
		    dsk->mem_size - WT_BLOCK_COMPRESS_SKIP, &result_len);

		/*
		 * If checksums were turned off because we're depending on the
		 * decompression to fail on any corrupted data, we'll end up
		 * here after corruption happens.  If we're salvaging the file,
		 * it's OK, otherwise it's really, really bad.
		 */
		if (ret != 0 ||
		    result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP) {
			fail_msg = "block decompression failed";
			goto corrupt;
		}
	} else
		/*
		 * If we uncompressed above, the page is in the correct buffer.
		 * If we get here the data may be in the wrong buffer and the
		 * buffer may be the wrong size.  If needed, get the page
		 * into the destination buffer.
		 */
		if (ip != NULL)
			WT_ERR(__wt_buf_set(
			    session, buf, ip->data, dsk->mem_size));

	/* If the handle is a verify handle, verify the physical page. */
	if (F_ISSET(btree, WT_BTREE_VERIFY)) {
		if (tmp == NULL)
			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size));
		WT_ERR(__wt_verify_dsk(session, tmp->data, buf));
	}

	WT_STAT_FAST_CONN_INCR(session, cache_read);
	WT_STAT_FAST_DATA_INCR(session, cache_read);
	if (F_ISSET(dsk, WT_PAGE_COMPRESSED))
		WT_STAT_FAST_DATA_INCR(session, compress_read);
	WT_STAT_FAST_CONN_INCRV(session, cache_bytes_read, dsk->mem_size);
	WT_STAT_FAST_DATA_INCRV(session, cache_bytes_read, dsk->mem_size);

	if (0) {
corrupt:	if (ret == 0)
			ret = WT_ERROR;
		if (!F_ISSET(btree, WT_BTREE_VERIFY) &&
		    !F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE)) {
			__wt_err(session, ret, "%s", fail_msg);
			ret = __wt_illegal_value(session, btree->dhandle->name);
		}
	}

err:	__wt_scr_free(session, &tmp);
	__wt_scr_free(session, &etmp);
	return (ret);
}
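Structurally, the read path swaps which buffer ip points at as each transform (raw read, optional decrypt, optional decompress) is applied, and funnels every corruption case through one label with fail_msg set beforehand. A stripped-down sketch of that error-funnel idiom, with the three transform steps as hypothetical stand-ins:

#include <stdio.h>

#define SKETCH_ERROR	1	/* illustrative error code */

extern int read_block(void);		/* hypothetical transform steps */
extern int decrypt_block(void);
extern int decompress_block(void);

static int
layered_read(void)
{
	const char *fail_msg;
	int ret;

	if ((ret = read_block()) != 0)
		goto err;
	if ((ret = decrypt_block()) != 0) {
		fail_msg = "block decryption failed";
		goto corrupt;
	}
	if ((ret = decompress_block()) != 0) {
		fail_msg = "block decompression failed";
		goto corrupt;
	}
	return (0);

corrupt:
	if (ret == 0)
		ret = SKETCH_ERROR;
	fprintf(stderr, "%s\n", fail_msg);
err:
	return (ret);
}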
Code example #12
File: os_mtx_cond.c  Project: Zhangwusheng/wiredtiger
/*
 * __wt_cond_wait_signal --
 *	Wait on a mutex, optionally timing out.  If we get it
 *	before the timeout period expires, let the caller know.
 */
int
__wt_cond_wait_signal(
    WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool *signalled)
{
	DWORD err, milliseconds;
	WT_DECL_RET;
	uint64_t milliseconds64;
	bool locked;

	locked = false;

	/* Fast path if already signalled. */
	*signalled = true;
	if (__wt_atomic_addi32(&cond->waiters, 1) == 0)
		return (0);

	/*
	 * !!!
	 * This function MUST handle a NULL session handle.
	 */
	if (session != NULL) {
		WT_RET(__wt_verbose(session, WT_VERB_MUTEX,
			"wait %s cond (%p)", cond->name, cond));
		WT_STAT_FAST_CONN_INCR(session, cond_wait);
	}

	EnterCriticalSection(&cond->mtx);
	locked = true;

	if (usecs > 0) {
		milliseconds64 = usecs / 1000;

		/*
		 * Check for 32-bit unsigned integer overflow:
		 * INFINITE is the maximum unsigned int on Windows.
		 */
		if (milliseconds64 >= INFINITE)
			milliseconds64 = INFINITE - 1;
		milliseconds = (DWORD)milliseconds64;

		/*
		 * A timeout of 0 would turn the condition-variable sleep
		 * into a try-wait, which we do not want.
		 */
		if (milliseconds == 0)
			milliseconds = 1;

		ret = SleepConditionVariableCS(
		    &cond->cond, &cond->mtx, milliseconds);
	} else
		ret = SleepConditionVariableCS(
		    &cond->cond, &cond->mtx, INFINITE);

	/*
	 * SleepConditionVariableCS returns non-zero on success, 0 on timeout
	 * or failure. Check for timeout, else convert to a WiredTiger error
	 * value and fail.
	 */
	if (ret == 0) {
		if ((err = GetLastError()) == ERROR_TIMEOUT)
			*signalled = false;
		else
			ret = __wt_errno();
	} else
		ret = 0;

	(void)__wt_atomic_subi32(&cond->waiters, 1);

	if (locked)
		LeaveCriticalSection(&cond->mtx);

	if (ret == 0)
		return (0);
	WT_RET_MSG(session, ret, "SleepConditionVariableCS");
}
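On POSIX systems the same contract is usually built on pthread_cond_timedwait, which reports ETIMEDOUT directly instead of requiring a GetLastError check. A rough equivalent sketch, assuming the condition variable uses the default CLOCK_REALTIME clock:

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <time.h>

/* Wait up to usecs microseconds; set *signalled to false on timeout. */
static int
cond_wait_signal(pthread_cond_t *cond, pthread_mutex_t *mtx,
    uint64_t usecs, bool *signalled)
{
	struct timespec ts;
	int ret;

	*signalled = true;
	pthread_mutex_lock(mtx);
	if (usecs > 0) {
		clock_gettime(CLOCK_REALTIME, &ts);
		ts.tv_sec += (time_t)(usecs / 1000000);
		ts.tv_nsec += (long)((usecs % 1000000) * 1000);
		if (ts.tv_nsec >= 1000000000L) {
			ts.tv_sec++;
			ts.tv_nsec -= 1000000000L;
		}
		ret = pthread_cond_timedwait(cond, mtx, &ts);
	} else
		ret = pthread_cond_wait(cond, mtx);
	pthread_mutex_unlock(mtx);

	if (ret == ETIMEDOUT) {
		*signalled = false;
		ret = 0;
	}
	return (ret);
}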
Code example #13
File: os_open.c  Project: RolfAndreassen/wiredtiger
/*
 * __wt_open --
 *	Open a file handle.
 */
int
__wt_open(WT_SESSION_IMPL *session,
    const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_FH *fh, *tfh;
	mode_t mode;
	int direct_io, f, fd, matched;
	const char *path;

	conn = S2C(session);
	fh = NULL;
	fd = -1;
	path = NULL;

	WT_VERBOSE_RET(session, fileops, "%s: open", name);

	/* Increment the reference count if we already have the file open. */
	matched = 0;
	__wt_spin_lock(session, &conn->fh_lock);
	TAILQ_FOREACH(tfh, &conn->fhqh, q)
		if (strcmp(name, tfh->name) == 0) {
			++tfh->refcnt;
			*fhp = tfh;
			matched = 1;
			break;
		}
	__wt_spin_unlock(session, &conn->fh_lock);
	if (matched)
		return (0);

	WT_RET(__wt_filename(session, name, &path));

	f = O_RDWR;
#ifdef O_BINARY
	/* Windows clones: we always want to treat the file as a binary. */
	f |= O_BINARY;
#endif
#ifdef O_CLOEXEC
	/*
	 * Security:
	 * The application may spawn a new process, and we don't want another
	 * process to have access to our file handles.
	 */
	f |= O_CLOEXEC;
#endif
#ifdef O_NOATIME
	/* Avoid updating metadata for read-only workloads. */
	if (dio_type == WT_FILE_TYPE_DATA)
		f |= O_NOATIME;
#endif

	if (ok_create) {
		f |= O_CREAT;
		if (exclusive)
			f |= O_EXCL;
		mode = 0666;
	} else
		mode = 0;

	direct_io = 0;
#ifdef O_DIRECT
	if (dio_type && FLD_ISSET(conn->direct_io, dio_type)) {
		f |= O_DIRECT;
		direct_io = 1;
	}
#endif
	if (dio_type == WT_FILE_TYPE_LOG &&
	    FLD_ISSET(conn->txn_logsync, WT_LOG_DSYNC))
#ifdef O_DSYNC
		f |= O_DSYNC;
#elif defined(O_SYNC)
		f |= O_SYNC;
#else
		WT_ERR_MSG(session, ENOTSUP,
		    "Unsupported log sync mode requested");
#endif
	WT_SYSCALL_RETRY(((fd = open(path, f, mode)) == -1 ? 1 : 0), ret);
	if (ret != 0)
		WT_ERR_MSG(session, ret,
		    direct_io ?
		    "%s: open failed with direct I/O configured, some "
		    "filesystem types do not support direct I/O" : "%s", path);

#if defined(HAVE_FCNTL) && defined(FD_CLOEXEC) && !defined(O_CLOEXEC)
	/*
	 * Security:
	 * The application may spawn a new process, and we don't want another
	 * process to have access to our file handles.  There's an obvious
	 * race here, so we prefer the flag to open if available.
	 */
	if ((f = fcntl(fd, F_GETFD)) == -1 ||
	    fcntl(fd, F_SETFD, f | FD_CLOEXEC) == -1)
		WT_ERR_MSG(session, __wt_errno(), "%s: fcntl", name);
#endif

#if defined(HAVE_POSIX_FADVISE)
	/* Disable read-ahead on trees: it slows down random read workloads. */
	if (dio_type == WT_FILE_TYPE_DATA)
		WT_ERR(posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM));
#endif

	if (F_ISSET(conn, WT_CONN_CKPT_SYNC))
		WT_ERR(__open_directory_sync(session));

	WT_ERR(__wt_calloc(session, 1, sizeof(WT_FH), &fh));
	WT_ERR(__wt_strdup(session, name, &fh->name));
	fh->fd = fd;
	fh->refcnt = 1;
	fh->direct_io = direct_io;

	/* Set the file's size. */
	WT_ERR(__wt_filesize(session, fh, &fh->size));

	/* Configure file extension. */
	if (dio_type == WT_FILE_TYPE_DATA)
		fh->extend_len = conn->data_extend_len;

	/*
	 * Repeat the check for a match, but then link onto the database's list
	 * of files.
	 */
	matched = 0;
	__wt_spin_lock(session, &conn->fh_lock);
	TAILQ_FOREACH(tfh, &conn->fhqh, q)
		if (strcmp(name, tfh->name) == 0) {
			++tfh->refcnt;
			*fhp = tfh;
			matched = 1;
			break;
		}
	if (!matched) {
		TAILQ_INSERT_TAIL(&conn->fhqh, fh, q);
		WT_STAT_FAST_CONN_INCR(session, file_open);

		*fhp = fh;
	}
	__wt_spin_unlock(session, &conn->fh_lock);
	if (matched) {
err:		if (fh != NULL) {
			__wt_free(session, fh->name);
			__wt_free(session, fh);
		}
		if (fd != -1)
			(void)close(fd);
	}

	__wt_free(session, path);
	return (ret);
}
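Note the shape of the handle cache: look up under the lock, drop the lock for the slow open, then look up again before inserting, discarding our new handle if another thread won the race. A compact pthreads sketch of that check/open/recheck pattern (all the types and names here are illustrative):

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct handle {
	char *name;
	int refcnt;
	struct handle *next;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct handle *handles;

/* Find a cached handle by name and take a reference; lock must be held. */
static struct handle *
lookup_locked(const char *name)
{
	struct handle *h;

	for (h = handles; h != NULL; h = h->next)
		if (strcmp(h->name, name) == 0) {
			++h->refcnt;
			return (h);
		}
	return (NULL);
}

struct handle *
handle_open(const char *name)
{
	struct handle *h, *match;

	pthread_mutex_lock(&list_lock);
	match = lookup_locked(name);
	pthread_mutex_unlock(&list_lock);
	if (match != NULL)
		return (match);

	/* Slow path: build a new handle without holding the lock. */
	if ((h = calloc(1, sizeof(*h))) == NULL)
		return (NULL);
	h->name = strdup(name);
	h->refcnt = 1;

	/* Recheck: another thread may have opened it meanwhile. */
	pthread_mutex_lock(&list_lock);
	if ((match = lookup_locked(name)) == NULL) {
		h->next = handles;
		handles = h;
	}
	pthread_mutex_unlock(&list_lock);

	if (match != NULL) {	/* We lost the race; discard our handle. */
		free(h->name);
		free(h);
		return (match);
	}
	return (h);
}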
Code example #14
File: bt_io.c  Project: ezhangle/node-wiredtigerdown
/*
 * __wt_bt_read --
 *	Read a cookie referenced block into a buffer.
 */
int
__wt_bt_read(WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	const WT_PAGE_HEADER *dsk;
	size_t result_len;

	btree = S2BT(session);
	bm = btree->bm;

	/*
	 * If anticipating a compressed block, read into a scratch buffer and
	 * decompress into the caller's buffer.  Else, read directly into the
	 * caller's buffer.
	 */
	if (btree->compressor == NULL) {
		WT_RET(bm->read(bm, session, buf, addr, addr_size));
		dsk = buf->data;
	} else {
		WT_RET(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->read(bm, session, tmp, addr, addr_size));
		dsk = tmp->data;
	}

	/*
	 * If the block is compressed, copy the skipped bytes of the original
	 * image into place, then decompress.
	 */
	if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) {
		if (btree->compressor == NULL ||
		    btree->compressor->decompress == NULL)
			WT_ERR_MSG(session, WT_ERROR,
			    "read compressed block where no compression engine "
			    "configured");

		/*
		 * We're allocating the exact number of bytes we're expecting
		 * from decompression.
		 */
		WT_ERR(__wt_buf_initsize(session, buf, dsk->mem_size));

		/*
		 * Note the source length is NOT the number of compressed bytes,
		 * it's the length of the block we just read (minus the skipped
		 * bytes).  We don't store the number of compressed bytes: some
		 * compression engines need that length stored externally, they
		 * don't have markers in the stream to signal the end of the
		 * compressed bytes.  Those engines must store the compressed
		 * byte length somehow, see the snappy compression extension for
		 * an example.
		 */
		memcpy(buf->mem, tmp->data, WT_BLOCK_COMPRESS_SKIP);
		WT_ERR(btree->compressor->decompress(
		    btree->compressor, &session->iface,
		    (uint8_t *)tmp->data + WT_BLOCK_COMPRESS_SKIP,
		    tmp->size - WT_BLOCK_COMPRESS_SKIP,
		    (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP,
		    dsk->mem_size - WT_BLOCK_COMPRESS_SKIP,
		    &result_len));

		/*
		 * If checksums were turned off because we're depending on the
		 * decompression to fail on any corrupted data, we'll end up
		 * here after corruption happens.  If we're salvaging the file,
		 * it's OK, otherwise it's really, really bad.
		 */
		if (result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP)
			WT_ERR(
			    F_ISSET(btree, WT_BTREE_VERIFY) ||
			    F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ?
			    WT_ERROR :
			    __wt_illegal_value(session, btree->dhandle->name));
	} else
		if (btree->compressor == NULL)
			buf->size = dsk->mem_size;
		else
			/*
			 * We guessed wrong: there was a compressor, but this
			 * block was not compressed, and now the page is in the
			 * wrong buffer and the buffer may be of the wrong size.
			 * This should be rare, but happens with small blocks
			 * that aren't worth compressing.
			 */
			WT_ERR(__wt_buf_set(
			    session, buf, tmp->data, dsk->mem_size));

	/* If the handle is a verify handle, verify the physical page. */
	if (F_ISSET(btree, WT_BTREE_VERIFY)) {
		if (tmp == NULL)
			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size));
		WT_ERR(__wt_verify_dsk(session, (const char *)tmp->data, buf));
	}

	WT_STAT_FAST_CONN_INCR(session, cache_read);
	WT_STAT_FAST_DATA_INCR(session, cache_read);
	if (F_ISSET(dsk, WT_PAGE_COMPRESSED))
		WT_STAT_FAST_DATA_INCR(session, compress_read);
	WT_STAT_FAST_CONN_INCRV(session, cache_bytes_read, dsk->mem_size);
	WT_STAT_FAST_DATA_INCRV(session, cache_bytes_read, dsk->mem_size);

err:	__wt_scr_free(&tmp);
	return (ret);
}
Code example #15
File: conn_log.c  Project: alabid/mongo
/*
 * __wt_log_wrlsn --
 *	Process written log slots and attempt to coalesce them if the LSNs
 *	are contiguous.  Reports the index of a free slot through the free_i
 *	argument, if requested.  Must be called with the log slot lock held.
 */
int
__wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield)
{
	WT_CONNECTION_IMPL *conn;
	WT_LOG *log;
	WT_LOG_WRLSN_ENTRY written[WT_SLOT_POOL];
	WT_LOGSLOT *coalescing, *slot;
	size_t written_i;
	uint32_t i, save_i;

	conn = S2C(session);
	log = conn->log;
	coalescing = NULL;
	written_i = 0;
	i = 0;
	if (free_i != NULL)
		*free_i = WT_SLOT_POOL;

	/*
	 * Walk the array once saving any slots that are in the
	 * WT_LOG_SLOT_WRITTEN state.
	 */
	while (i < WT_SLOT_POOL) {
		save_i = i;
		slot = &log->slot_pool[i++];
		if (free_i != NULL && *free_i == WT_SLOT_POOL &&
		    slot->slot_state == WT_LOG_SLOT_FREE)
			*free_i = save_i;
		if (slot->slot_state != WT_LOG_SLOT_WRITTEN)
			continue;
		written[written_i].slot_index = save_i;
		written[written_i++].lsn = slot->slot_release_lsn;
	}
	/*
	 * If we found any written slots process them.  We sort them
	 * based on the release LSN, and then look for them in order.
	 */
	if (written_i > 0) {
		/*
		 * If wanted, reset the yield variable to indicate that we
		 * have found written slots.
		 */
		if (yield != NULL)
			*yield = 0;
		WT_INSERTION_SORT(written, written_i,
		    WT_LOG_WRLSN_ENTRY, WT_WRLSN_ENTRY_CMP_LT);

		/*
		 * We know the written array is sorted by LSN.  Go
		 * through it, either advancing write_lsn or coalescing
		 * contiguous ranges of written slots.
		 */
		for (i = 0; i < written_i; i++) {
			slot = &log->slot_pool[written[i].slot_index];
			if (coalescing != NULL) {
				if (WT_LOG_CMP(&coalescing->slot_end_lsn,
				    &written[i].lsn) != 0) {
					coalescing = slot;
					continue;
				}
				/*
				 * If we get here we have a slot to coalesce
				 * and free.
				 */
				coalescing->slot_end_lsn = slot->slot_end_lsn;
				WT_STAT_FAST_CONN_INCR(
				    session, log_slot_coalesced);
				/*
				 * Copy the flag for later closing.
				 */
				if (F_ISSET(slot, WT_SLOT_CLOSEFH))
					F_SET(coalescing, WT_SLOT_CLOSEFH);
			} else {
				/*
				 * If this written slot is not the next LSN,
				 * try to start coalescing with later slots.
				 */
				if (WT_LOG_CMP(
				    &log->write_lsn, &written[i].lsn) != 0) {
					coalescing = slot;
					continue;
				}
				/*
				 * If we get here we have a slot to process.
				 * Advance the LSN and process the slot.
				 */
				WT_ASSERT(session, WT_LOG_CMP(&written[i].lsn,
				    &slot->slot_release_lsn) == 0);
				log->write_start_lsn = slot->slot_start_lsn;
				log->write_lsn = slot->slot_end_lsn;
				WT_RET(__wt_cond_signal(
				    session, log->log_write_cond));
				WT_STAT_FAST_CONN_INCR(session, log_write_lsn);
				/*
				 * Signal the close thread if needed.
				 */
				if (F_ISSET(slot, WT_SLOT_CLOSEFH))
					WT_RET(__wt_cond_signal(
					    session, conn->log_file_cond));
			}
			WT_RET(__wt_log_slot_free(session, slot));
			if (free_i != NULL && *free_i == WT_SLOT_POOL &&
			    slot->slot_state == WT_LOG_SLOT_FREE)
				*free_i = save_i;
		}
	}
	return (0);
}
Code example #16
/*
 * __wt_page_in_func --
 *	Acquire a hazard pointer to a page; if the page is not in-memory,
 *	read it from the disk and build an in-memory version.
 */
int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
	WT_DECL_RET;
	WT_PAGE *page;
	u_int sleep_cnt, wait_cnt;
	int busy, force_attempts, oldgen;

	for (force_attempts = oldgen = 0, wait_cnt = 0;;) {
		switch (ref->state) {
		case WT_REF_DISK:
		case WT_REF_DELETED:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);

			/*
			 * The page isn't in memory, attempt to read it.
			 * Make sure there is space in the cache.
			 */
			WT_RET(__wt_cache_full_check(session));
			WT_RET(__wt_cache_read(session, ref));
			oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
			    F_ISSET(session, WT_SESSION_NO_CACHE);
			continue;
		case WT_REF_READING:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);
			WT_STAT_FAST_CONN_INCR(session, page_read_blocked);
			break;
		case WT_REF_LOCKED:
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);
			WT_STAT_FAST_CONN_INCR(session, page_locked_blocked);
			break;
		case WT_REF_SPLIT:
			return (WT_RESTART);
		case WT_REF_MEM:
			/*
			 * The page is in memory: get a hazard pointer, update
			 * the page's LRU and return.  The expected reason we
			 * can't get a hazard pointer is that the page is
			 * being evicted; yield and try again.
			 */
#ifdef HAVE_DIAGNOSTIC
			WT_RET(
			    __wt_hazard_set(session, ref, &busy, file, line));
#else
			WT_RET(__wt_hazard_set(session, ref, &busy));
#endif
			if (busy) {
				WT_STAT_FAST_CONN_INCR(
				    session, page_busy_blocked);
				break;
			}

			page = ref->page;
			WT_ASSERT(session, page != NULL);

			/* Forcibly evict pages that are too big. */
			if (!LF_ISSET(WT_READ_NO_EVICT) &&
			    force_attempts < 10 &&
			    __evict_force_check(session, page)) {
				++force_attempts;
				if ((ret = __wt_page_release_busy(
				    session, ref, flags)) == EBUSY) {
					/* If forced eviction fails, stall. */
					ret = 0;
					wait_cnt += 1000;
				} else
					WT_RET(ret);
				WT_STAT_FAST_CONN_INCR(
				    session, page_forcible_evict_blocked);
				break;
			}

			/* Check if we need an autocommit transaction. */
			if ((ret = __wt_txn_autocommit_check(session)) != 0) {
				WT_TRET(__wt_hazard_clear(session, page));
				return (ret);
			}

			/*
			 * If we read the page and we are configured to not
			 * trash the cache, set the oldest read generation so
			 * the page is forcibly evicted as soon as possible.
			 *
			 * Otherwise, update the page's read generation.
			 */
			if (oldgen && page->read_gen == WT_READGEN_NOTSET)
				__wt_page_evict_soon(page);
			else if (!LF_ISSET(WT_READ_NO_GEN) &&
			    page->read_gen < __wt_cache_read_gen(session))
				page->read_gen =
				    __wt_cache_read_gen_set(session);

			return (0);
		WT_ILLEGAL_VALUE(session);
		}

		/*
		 * We failed to get the page -- yield before retrying, and if
		 * we've yielded enough times, start sleeping so we don't burn
		 * CPU to no purpose.
		 */
		if (++wait_cnt < 1000)
			__wt_yield();
		else {
			sleep_cnt = WT_MIN(wait_cnt, 10000);
			wait_cnt *= 2;
			WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt);
			__wt_sleep(0, sleep_cnt);
		}
	}
}
Code example #17
File: log_slot.c  Project: Jaryli/mongo
/*
 * __wt_log_slot_join --
 *	Join a consolidated logging slot.  Must be called with
 *	the read lock held.
 */
void
__wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize,
    uint32_t flags, WT_MYSLOT *myslot)
{
	WT_CONNECTION_IMPL *conn;
	WT_LOG *log;
	WT_LOGSLOT *slot;
	int64_t flag_state, new_state, old_state, released;
	int32_t join_offset, new_join;
#ifdef	HAVE_DIAGNOSTIC
	bool unbuf_force;
#endif

	conn = S2C(session);
	log = conn->log;

	/*
	 * Make sure the length cannot overflow.  The caller should not
	 * even call this function if the record doesn't fit: it should
	 * use a direct write instead.  We must not already hold the
	 * slot lock.
	 */
	WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT));

	/*
	 * There should almost always be a slot open.
	 */
#ifdef	HAVE_DIAGNOSTIC
	unbuf_force = (++log->write_calls % 1000) == 0;
#endif
	for (;;) {
		WT_BARRIER();
		slot = log->active_slot;
		old_state = slot->slot_state;
		/*
		 * Try to join our size into the existing size and
		 * atomically write it back into the state.
		 */
		flag_state = WT_LOG_SLOT_FLAGS(old_state);
		released = WT_LOG_SLOT_RELEASED(old_state);
		join_offset = WT_LOG_SLOT_JOINED(old_state);
#ifdef	HAVE_DIAGNOSTIC
		if (unbuf_force || mysize > WT_LOG_SLOT_BUF_MAX) {
#else
		if (mysize > WT_LOG_SLOT_BUF_MAX) {
#endif
			new_join = join_offset + WT_LOG_SLOT_UNBUFFERED;
			F_SET(myslot, WT_MYSLOT_UNBUFFERED);
			myslot->slot = slot;
		} else
			new_join = join_offset + (int32_t)mysize;
		new_state = (int64_t)WT_LOG_SLOT_JOIN_REL(
		    (int64_t)new_join, (int64_t)released, (int64_t)flag_state);

		/*
		 * Check if the slot is open for joining and we are able to
		 * swap in our size into the state.
		 */
		if (WT_LOG_SLOT_OPEN(old_state) &&
		    __wt_atomic_casiv64(
		    &slot->slot_state, old_state, new_state))
			break;
		/*
		 * The slot is no longer open or we lost the race to
		 * update it.  Yield and try again.
		 */
		WT_STAT_FAST_CONN_INCR(session, log_slot_races);
		__wt_yield();
	}
	/*
	 * We joined this slot.  Fill in our information to return to
	 * the caller.
	 */
	if (mysize != 0)
		WT_STAT_FAST_CONN_INCR(session, log_slot_joins);
	if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC))
		F_SET(slot, WT_SLOT_SYNC_DIR);
	if (LF_ISSET(WT_LOG_FLUSH))
		F_SET(slot, WT_SLOT_FLUSH);
	if (LF_ISSET(WT_LOG_FSYNC))
		F_SET(slot, WT_SLOT_SYNC);
	if (F_ISSET(myslot, WT_MYSLOT_UNBUFFERED)) {
		WT_ASSERT(session, slot->slot_unbuffered == 0);
		WT_STAT_FAST_CONN_INCR(session, log_slot_unbuffered);
		slot->slot_unbuffered = (int64_t)mysize;
	}
	myslot->slot = slot;
	myslot->offset = join_offset;
	myslot->end_offset = (wt_off_t)((uint64_t)join_offset + mysize);
}
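
The join step itself is the interesting part: all of the slot's bookkeeping lives in a single 64-bit state word, so claiming space is one compare-and-swap with no lock held. Below is a minimal sketch of that idea under a simplified, assumed layout (high 32 bits joined, low 32 bits released); the real WT_LOG_SLOT_* macros also reserve bits for flags, so treat the packing here as illustrative only.

#include <stdatomic.h>
#include <stdint.h>

/* Hypothetical packing: high 32 bits joined, low 32 bits released. */
#define SLOT_JOINED(s)		((int32_t)((s) >> 32))
#define SLOT_RELEASED(s)	((int32_t)((s) & 0xffffffff))
#define SLOT_STATE(j, r)	(((int64_t)(j) << 32) | (uint32_t)(r))

static _Atomic int64_t slot_state;

/* Reserve size bytes in the slot; returns our starting offset. */
int32_t
slot_join(int32_t size)
{
	int64_t new_state, old_state;
	int32_t joined;

	for (;;) {
		old_state = atomic_load(&slot_state);
		joined = SLOT_JOINED(old_state);
		new_state =
		    SLOT_STATE(joined + size, SLOT_RELEASED(old_state));
		if (atomic_compare_exchange_weak(
		    &slot_state, &old_state, new_state))
			return (joined);	/* We own [joined, joined+size). */
		/* Lost the race: another thread updated the word; retry. */
	}
}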

/*
 * __wt_log_slot_release --
 *	Each thread in a consolidated group releases its portion to
 *	signal it has completed copying its piece of the log into
 *	the memory buffer.
 */
int64_t
__wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size)
{
	WT_LOGSLOT *slot;
	wt_off_t cur_offset, my_start;
	int64_t my_size, rel_size;

	WT_UNUSED(session);
	slot = myslot->slot;
	my_start = slot->slot_start_offset + myslot->offset;
	while ((cur_offset = slot->slot_last_offset) < my_start) {
		/*
		 * Set our offset if we are larger.
		 */
		if (__wt_atomic_casiv64(
		    &slot->slot_last_offset, cur_offset, my_start))
			break;
		/*
		 * If we raced another thread updating this, try again.
		 */
		WT_BARRIER();
	}
	/*
	 * Add my size into the state and return the new size.
	 */
	rel_size = size;
	if (F_ISSET(myslot, WT_MYSLOT_UNBUFFERED))
		rel_size = WT_LOG_SLOT_UNBUFFERED;
	my_size = (int64_t)WT_LOG_SLOT_JOIN_REL((int64_t)0, rel_size, 0);
	return (__wt_atomic_addiv64(&slot->slot_state, my_size));
}
Code example #18
File: bt_cursor.c  Project: radik/mongo
/*
 * __wt_btcur_search_near --
 *	Search for a record in the tree.
 */
int
__wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	int exact, valid;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	upd = NULL;					/* -Wuninitialized */
	exact = 0;

	WT_STAT_FAST_CONN_INCR(session, cursor_search_near);
	WT_STAT_FAST_DATA_INCR(session, cursor_search_near);

	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));

	/*
	 * If we have a row-store page pinned, search it; if we don't have a
	 * page pinned, or the search of the pinned page doesn't find an exact
	 * match, search from the root. Unlike WT_CURSOR.search, ignore pinned
	 * pages in the case of column-store, search-near isn't an interesting
	 * enough case for column-store to add the complexity needed to avoid
	 * the tree search.
	 *
	 * Set the "insert" flag for the btree row-store search; we may intend
	 * to position the cursor at the end of the tree, rather than match an
	 * existing record.
	 */
	valid = 0;
	if (btree->type == BTREE_ROW &&
	    F_ISSET(cbt, WT_CBT_ACTIVE) &&
	    cbt->ref->page->read_gen != WT_READGEN_OLDEST) {
		__wt_txn_cursor_op(session);

		WT_ERR(__cursor_row_search(session, cbt, cbt->ref, 1));

		/*
		 * Search-near is trickier than search when searching an already
		 * pinned page. If search returns the first or last page slots,
		 * discard the results and search the full tree as the neighbor
		 * pages might offer better matches. This test is simplistic as
		 * we're ignoring append lists (there may be no page slots or we
		 * might be legitimately positioned after the last page slot).
		 * Ignore those cases; it makes things too complicated.
		 */
		if (cbt->slot != 0 &&
		    cbt->slot != cbt->ref->page->pg_row_entries - 1)
			valid = __cursor_valid(cbt, &upd);
	}
	if (!valid) {
		WT_ERR(__cursor_func_init(cbt, 1));
		WT_ERR(btree->type == BTREE_ROW ?
		    __cursor_row_search(session, cbt, NULL, 1) :
		    __cursor_col_search(session, cbt, NULL));
		valid = __cursor_valid(cbt, &upd);
	}

	/*
	 * If we find a valid key, return it.
	 *
	 * Else, creating a record past the end of the tree in a fixed-length
	 * column-store implicitly fills the gap with empty records.  In this
	 * case, we instantiate the empty record, it's an exact match.
	 *
	 * Else, move to the next key in the tree (bias for prefix searches).
	 * Cursor next skips invalid rows, so we don't have to test for them
	 * again.
	 *
	 * Else, redo the search and move to the previous key in the tree.
	 * Cursor previous skips invalid rows, so we don't have to test for
	 * them again.
	 *
	 * If that fails, quit, there's no record to return.
	 */
	if (valid) {
		exact = cbt->compare;
		ret = __wt_kv_return(session, cbt, upd);
	} else if (__cursor_fix_implicit(btree, cbt)) {
		cbt->recno = cursor->recno;
		cbt->v = 0;
		cursor->value.data = &cbt->v;
		cursor->value.size = 1;
		exact = 0;
	} else if ((ret = __wt_btcur_next(cbt, 0)) != WT_NOTFOUND)
		exact = 1;
	else {
		WT_ERR(btree->type == BTREE_ROW ?
		    __cursor_row_search(session, cbt, NULL, 1) :
		    __cursor_col_search(session, cbt, NULL));
		if (__cursor_valid(cbt, &upd)) {
			exact = cbt->compare;
			ret = __wt_kv_return(session, cbt, upd);
		} else if ((ret = __wt_btcur_prev(cbt, 0)) != WT_NOTFOUND)
			exact = -1;
	}

err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	if (exactp != NULL && (ret == 0 || ret == WT_NOTFOUND))
		*exactp = exact;
	return (ret);
}
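
For callers, all of this surfaces through WT_CURSOR::search_near and its exact output: negative means the cursor landed on the nearest smaller key, zero an exact match, positive the nearest larger key. A usage sketch, assuming a table with string ("S") key format:

#include <stdio.h>
#include <wiredtiger.h>

/* Position cursor at or near key; a sketch of the search_near contract. */
static int
position_near(WT_CURSOR *cursor, const char *key)
{
	int exact, ret;

	cursor->set_key(cursor, key);
	if ((ret = cursor->search_near(cursor, &exact)) != 0)
		return (ret);	/* WT_NOTFOUND if the tree is empty. */
	if (exact == 0)
		printf("exact match for %s\n", key);
	else if (exact > 0)
		printf("positioned on the nearest key after %s\n", key);
	else
		printf("positioned on the nearest key before %s\n", key);
	return (0);
}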
Code example #19
File: log_slot.c  Project: Jaryli/mongo
/*
 * __log_slot_close --
 *	Close out the slot the caller is using.  The slot may already be
 *	closed or freed by another thread.
 */
static int
__log_slot_close(
    WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *releasep, bool forced)
{
	WT_CONNECTION_IMPL *conn;
	WT_LOG *log;
	int64_t end_offset, new_state, old_state;

	WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
	WT_ASSERT(session, releasep != NULL);
	conn = S2C(session);
	log = conn->log;
	*releasep = false;
	if (slot == NULL)
		return (WT_NOTFOUND);
retry:
	old_state = slot->slot_state;
	/*
	 * If this close is coming from a forced close and a thread is in
	 * the middle of using the slot, return EBUSY.  The caller can
	 * decide if retrying is necessary or not.
	 */
	if (forced && WT_LOG_SLOT_INPROGRESS(old_state))
		return (EBUSY);
	/*
	 * If someone else is switching out this slot we lost.  Nothing to
	 * do but return.  Return WT_NOTFOUND anytime the given slot was
	 * processed by another closing thread.  Only return 0 when we
	 * actually closed the slot.
	 */
	if (WT_LOG_SLOT_CLOSED(old_state))
		return (WT_NOTFOUND);
	/*
	 * If someone completely processed this slot, we're done.
	 */
	if (FLD64_ISSET((uint64_t)slot->slot_state, WT_LOG_SLOT_RESERVED))
		return (WT_NOTFOUND);
	new_state = (old_state | WT_LOG_SLOT_CLOSE);
	/*
	 * Close this slot.  If we lose the race retry.
	 */
	if (!__wt_atomic_casiv64(&slot->slot_state, old_state, new_state))
		goto retry;
	/*
	 * We own the slot now.  No one else can join.
	 * Set the end LSN.
	 */
	WT_STAT_FAST_CONN_INCR(session, log_slot_closes);
	if (WT_LOG_SLOT_DONE(new_state))
		*releasep = true;
	slot->slot_end_lsn = slot->slot_start_lsn;
	end_offset =
	    WT_LOG_SLOT_JOINED_BUFFERED(old_state) + slot->slot_unbuffered;
	slot->slot_end_lsn.offset += (wt_off_t)end_offset;
	WT_STAT_FAST_CONN_INCRV(session,
	    log_slot_consolidated, end_offset);
	/*
	 * XXX Would like to change so one piece of code advances the LSN.
	 */
	log->alloc_lsn = slot->slot_end_lsn;
	WT_ASSERT(session, log->alloc_lsn.file >= log->write_lsn.file);
	return (0);
}
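
The core pattern here is closing by flag: atomically OR a CLOSE bit into the state word, and whoever wins the CAS owns shutdown of the slot while everyone else sees WT_NOTFOUND. A standalone sketch of that pattern, with a hypothetical bit layout and names:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define STATE_CLOSED	((int64_t)1 << 62)	/* Hypothetical bit layout. */

static _Atomic int64_t slot_state;

/* Returns true if we closed the slot, false if another thread beat us. */
bool
slot_close(void)
{
	int64_t old_state;

	for (;;) {
		old_state = atomic_load(&slot_state);
		if (old_state & STATE_CLOSED)
			return (false);	/* Already closed by someone else. */
		if (atomic_compare_exchange_weak(
		    &slot_state, &old_state, old_state | STATE_CLOSED))
			return (true);	/* We own the close. */
	}
}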
Code example #20
File: bt_cursor.c  Project: radik/mongo
/*
 * __wt_btcur_remove --
 *	Remove a record from the tree.
 */
int
__wt_btcur_remove(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;

	WT_STAT_FAST_CONN_INCR(session, cursor_remove);
	WT_STAT_FAST_DATA_INCR(session, cursor_remove);
	WT_STAT_FAST_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size);

	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));

retry:	WT_RET(__cursor_func_init(cbt, 1));

	switch (btree->type) {
	case BTREE_COL_FIX:
	case BTREE_COL_VAR:
		WT_ERR(__cursor_col_search(session, cbt, NULL));

		/*
		 * If we find a matching record, check whether an update would
		 * conflict.  Do this before checking if the update is visible
		 * in __cursor_valid, or we can miss a conflict.
		 */
		WT_ERR(__curfile_update_check(cbt));

		/* Remove the record if it exists. */
		if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) {
			if (!__cursor_fix_implicit(btree, cbt))
				WT_ERR(WT_NOTFOUND);
			/*
			 * Creating a record past the end of the tree in a
			 * fixed-length column-store implicitly fills the
			 * gap with empty records.  Return success in that
			 * case, the record was deleted successfully.
			 *
			 * Correct the btree cursor's location: the search
			 * will have pointed us at the previous/next item,
			 * and that's not correct.
			 */
			cbt->recno = cursor->recno;
		} else
			ret = __cursor_col_modify(session, cbt, 1);
		break;
	case BTREE_ROW:
		/* Remove the record if it exists. */
		WT_ERR(__cursor_row_search(session, cbt, NULL, 0));

		/* Check whether an update would conflict. */
		WT_ERR(__curfile_update_check(cbt));

		if (cbt->compare != 0 || !__cursor_valid(cbt, NULL))
			WT_ERR(WT_NOTFOUND);

		ret = __cursor_row_modify(session, cbt, 1);
		break;
	WT_ILLEGAL_VALUE_ERR(session);
	}

err:	if (ret == WT_RESTART) {
		WT_STAT_FAST_CONN_INCR(session, cursor_restart);
		WT_STAT_FAST_DATA_INCR(session, cursor_restart);
		goto retry;
	}
	/*
	 * If the cursor is configured to overwrite and the record is not
	 * found, that is exactly what we want.
	 */
	if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) && ret == WT_NOTFOUND)
		ret = 0;

	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));

	return (ret);
}
Code example #21
File: os_mtx_cond.c  Project: ChineseDr/mongo
/*
 * __wt_cond_wait_signal --
 *	Wait on a mutex, optionally timing out.  If we get it
 *	before the time out period expires, let the caller know.
 */
int
__wt_cond_wait_signal(
    WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool *signalled)
{
    BOOL sleepret;
    DWORD milliseconds, windows_error;
    bool locked;
    uint64_t milliseconds64;

    locked = false;

    /* Fast path if already signalled. */
    *signalled = true;
    if (__wt_atomic_addi32(&cond->waiters, 1) == 0)
        return (0);

    /*
     * !!!
     * This function MUST handle a NULL session handle.
     */
    if (session != NULL) {
        WT_RET(__wt_verbose(session, WT_VERB_MUTEX,
                            "wait %s cond (%p)", cond->name, cond));
        WT_STAT_FAST_CONN_INCR(session, cond_wait);
    }

    EnterCriticalSection(&cond->mtx);
    locked = true;

    if (usecs > 0) {
        milliseconds64 = usecs / 1000;

        /*
         * Check for 32-bit unsigned integer overflow;
         * INFINITE is the maximum unsigned int on Windows.
         */
        if (milliseconds64 >= INFINITE)
            milliseconds64 = INFINITE - 1;
        milliseconds = (DWORD)milliseconds64;

        /*
         * A timeout of 0 would turn the condition-variable sleep
         * into a try-wait, which we do not want.
         */
        if (milliseconds == 0)
            milliseconds = 1;

        sleepret = SleepConditionVariableCS(
                       &cond->cond, &cond->mtx, milliseconds);
    } else
        sleepret = SleepConditionVariableCS(
                       &cond->cond, &cond->mtx, INFINITE);

    /*
     * SleepConditionVariableCS returns non-zero on success, 0 on timeout
     * or failure.
     */
    if (sleepret == 0) {
        windows_error = __wt_getlasterror();
        if (windows_error == ERROR_TIMEOUT) {
            *signalled = false;
            sleepret = 1;
        }
    }

    (void)__wt_atomic_subi32(&cond->waiters, 1);

    if (locked)
        LeaveCriticalSection(&cond->mtx);

    if (sleepret != 0)
        return (0);

    __wt_errx(session, "SleepConditionVariableCS: %s",
              __wt_formatmessage(session, windows_error));
    return (__wt_map_windows_error(windows_error));
}
Code example #22
File: bt_curnext.c  Project: EaseTech/wiredtiger
/*
 * __wt_btcur_next --
 *	Move to the next record in the tree.
 */
int
__wt_btcur_next(WT_CURSOR_BTREE *cbt, int truncating)
{
	WT_DECL_RET;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	uint32_t flags;
	int newpage;

	session = (WT_SESSION_IMPL *)cbt->iface.session;

	WT_STAT_FAST_CONN_INCR(session, cursor_next);
	WT_STAT_FAST_DATA_INCR(session, cursor_next);

	flags = WT_READ_SKIP_INTL;			/* Tree walk flags. */
	if (truncating)
		LF_SET(WT_READ_TRUNCATE);

	WT_RET(__cursor_func_init(cbt, 0));

	/*
	 * If we aren't already iterating in the right direction, there's
	 * some setup to do.
	 */
	if (!F_ISSET(cbt, WT_CBT_ITERATE_NEXT))
		__wt_btcur_iterate_setup(cbt, 1);

	/*
	 * Walk any page we're holding until the underlying call returns not-
	 * found.  Then, move to the next page, until we reach the end of the
	 * file.
	 */
	page = cbt->ref == NULL ? NULL : cbt->ref->page;
	for (newpage = 0;; newpage = 1) {
		if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
			switch (page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_append_next(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_append_next(cbt, newpage);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			if (ret == 0)
				break;
			F_CLR(cbt, WT_CBT_ITERATE_APPEND);
			if (ret != WT_NOTFOUND)
				break;
		} else if (page != NULL) {
			switch (page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_next(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_next(cbt, newpage);
				break;
			case WT_PAGE_ROW_LEAF:
				ret = __cursor_row_next(cbt, newpage);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			if (ret != WT_NOTFOUND)
				break;

			/*
			 * The last page in a column-store has appended entries.
			 * We handle it separately from the usual cursor code:
			 * it's only that one page and it's in a simple format.
			 */
			if (page->type != WT_PAGE_ROW_LEAF &&
			    (cbt->ins_head = WT_COL_APPEND(page)) != NULL) {
				F_SET(cbt, WT_CBT_ITERATE_APPEND);
				continue;
			}
		}

		WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
		WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
		page = cbt->ref->page;
		WT_ASSERT(session,
		    page->type != WT_PAGE_COL_INT &&
		    page->type != WT_PAGE_ROW_INT);
	}

err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}
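
From the application side this function backs WT_CURSOR::next, where WT_NOTFOUND signals a clean end of the table rather than an error. A usage sketch, assuming string ("S") key and value formats:

#include <stdio.h>
#include <wiredtiger.h>

/* Iterate every record; the loop __wt_btcur_next serves. */
static int
scan_all(WT_CURSOR *cursor)
{
	const char *key, *value;
	int ret;

	while ((ret = cursor->next(cursor)) == 0) {
		if ((ret = cursor->get_key(cursor, &key)) != 0 ||
		    (ret = cursor->get_value(cursor, &value)) != 0)
			return (ret);
		printf("%s -> %s\n", key, value);
	}
	return (ret == WT_NOTFOUND ? 0 : ret);	/* NOTFOUND = clean end. */
}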
Code example #23
File: bt_read.c  Project: doubaokun/wiredtiger
/*
 * __page_read --
 *	Read a page from the file.
 */
static int
__page_read(WT_SESSION_IMPL *session, WT_REF *ref)
{
    const WT_PAGE_HEADER *dsk;
    WT_BTREE *btree;
    WT_DECL_RET;
    WT_ITEM tmp;
    WT_PAGE *page;
    size_t addr_size;
    uint32_t previous_state;
    const uint8_t *addr;

    btree = S2BT(session);
    page = NULL;

    /*
     * Don't pass an allocated buffer to the underlying block read function,
     * force allocation of new memory of the appropriate size.
     */
    WT_CLEAR(tmp);

    /*
     * Attempt to set the state to WT_REF_READING for normal reads, or
     * WT_REF_LOCKED, for deleted pages.  If successful, we've won the
     * race, read the page.
     */
    if (__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_READING))
        previous_state = WT_REF_DISK;
    else if (__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED))
        previous_state = WT_REF_DELETED;
    else
        return (0);

    /*
     * Get the address: if there is no address, the page was deleted, but a
     * subsequent search or insert is forcing re-creation of the name space.
     */
    WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
    if (addr == NULL) {
        WT_ASSERT(session, previous_state == WT_REF_DELETED);

        WT_ERR(__wt_btree_new_leaf_page(session, &page));
        ref->page = page;
        goto done;
    }

    /*
     * There's an address, read or map the backing disk page and build an
     * in-memory version of the page.
     */
    WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size));
    WT_ERR(__wt_page_inmem(session, ref, tmp.data, tmp.memsize,
                           WT_DATA_IN_ITEM(&tmp) ?
                           WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));

    /*
     * Clear the local reference to an allocated copy of the disk image on
     * return; the page steals it, errors in this code should not free it.
     */
    tmp.mem = NULL;

    /*
     * If reading for a checkpoint, there's no additional work to do, the
     * page on disk is correct as written.
     */
    if (session->dhandle->checkpoint != NULL)
        goto done;

    /* If the page was deleted, instantiate that information. */
    if (previous_state == WT_REF_DELETED)
        WT_ERR(__wt_delete_page_instantiate(session, ref));

    /*
     * Instantiate updates from the database's lookaside table. The page
     * flag was set when the page was written, potentially a long time ago.
     * We only care if the lookaside table is currently active, check that
     * before doing any work.
     */
    dsk = tmp.data;
    if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE) && __wt_las_is_written(session)) {
        WT_STAT_FAST_CONN_INCR(session, cache_read_lookaside);
        WT_STAT_FAST_DATA_INCR(session, cache_read_lookaside);

        WT_ERR(__las_page_instantiate(
                   session, ref, btree->id, addr, addr_size));
    }

done:
    WT_PUBLISH(ref->state, WT_REF_MEM);
    return (0);

err:
    /*
     * If the function building an in-memory version of the page failed,
     * it discarded the page, but not the disk image.  Discard the page
     * and separately discard the disk image in all cases.
     */
    if (ref->page != NULL)
        __wt_ref_out(session, ref);
    WT_PUBLISH(ref->state, previous_state);

    __wt_buf_free(session, &tmp);

    return (ret);
}
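
The opening CAS is what makes concurrent faults of the same page safe: only the thread that flips the state from disk/deleted to reading performs the I/O, while everyone else returns and retries in the caller. The closing WT_PUBLISH pairs with it, making the built page visible before the state changes. A simplified standalone sketch, with hypothetical names and a reduced state set:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

enum ref_state { REF_DISK, REF_READING, REF_MEM };	/* Simplified. */

static _Atomic uint32_t state = REF_DISK;

/* Only the thread that wins this CAS performs the disk read. */
bool
try_claim_read(void)
{
	uint32_t expected = REF_DISK;

	return (atomic_compare_exchange_strong(
	    &state, &expected, REF_READING));
}

/* Publish with release ordering so the built page is visible first. */
void
publish_in_memory(void)
{
	atomic_store_explicit(&state, REF_MEM, memory_order_release);
}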
Code example #24
File: bt_curnext.c  Project: 7segments/mongo-1
/*
 * __wt_btcur_next --
 *	Move to the next record in the tree.
 */
int
__wt_btcur_next(WT_CURSOR_BTREE *cbt, int truncating)
{
	WT_DECL_RET;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	uint32_t flags;
	int newpage;

	session = (WT_SESSION_IMPL *)cbt->iface.session;

	WT_STAT_FAST_CONN_INCR(session, cursor_next);
	WT_STAT_FAST_DATA_INCR(session, cursor_next);

	flags = WT_READ_SKIP_INTL;			/* Tree walk flags. */
	if (truncating)
		LF_SET(WT_READ_TRUNCATE);

	WT_RET(__cursor_func_init(cbt, 0));

	/*
	 * If we aren't already iterating in the right direction, there's
	 * some setup to do.
	 */
	if (!F_ISSET(cbt, WT_CBT_ITERATE_NEXT))
		__wt_btcur_iterate_setup(cbt, 1);

	/*
	 * Walk any page we're holding until the underlying call returns not-
	 * found.  Then, move to the next page, until we reach the end of the
	 * file.
	 */
	for (newpage = 0;; newpage = 1) {
		page = cbt->ref == NULL ? NULL : cbt->ref->page;
		WT_ASSERT(session, page == NULL || !WT_PAGE_IS_INTERNAL(page));

		if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
			switch (page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_append_next(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_append_next(cbt, newpage);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			if (ret == 0)
				break;
			F_CLR(cbt, WT_CBT_ITERATE_APPEND);
			if (ret != WT_NOTFOUND)
				break;
		} else if (page != NULL) {
			switch (page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_next(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_next(cbt, newpage);
				break;
			case WT_PAGE_ROW_LEAF:
				ret = __cursor_row_next(cbt, newpage);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			if (ret != WT_NOTFOUND)
				break;

			/*
			 * The last page in a column-store has appended entries.
			 * We handle it separately from the usual cursor code:
			 * it's only that one page and it's in a simple format.
			 */
			if (page->type != WT_PAGE_ROW_LEAF &&
			    (cbt->ins_head = WT_COL_APPEND(page)) != NULL) {
				F_SET(cbt, WT_CBT_ITERATE_APPEND);
				continue;
			}
		}

		/*
		 * If we saw a lot of deleted records on this page, or we went
		 * all the way through a page and only saw deleted records, try
		 * to evict the page when we release it.  Otherwise repeatedly
		 * deleting from the beginning of a tree can have quadratic
		 * performance.  Take care not to force eviction of pages that
		 * are genuinely empty, in new trees.
		 */
		if (page != NULL &&
		    (cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD ||
		    (newpage && cbt->page_deleted_count > 0)))
			__wt_page_evict_soon(page);
		cbt->page_deleted_count = 0;

		WT_ERR(__wt_tree_walk(session, &cbt->ref, NULL, flags));
		WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
	}

err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}
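
What distinguishes this version from example #22 is the deleted-count heuristic before the tree walk: a page that yielded mostly deleted records is queued for eviction, so repeatedly deleting from the front of a tree stays linear instead of going quadratic. A sketch of the bookkeeping, with a hypothetical threshold and struct:

#include <stdbool.h>

#define DELETE_THRESHOLD 1000		/* Hypothetical value. */

struct page_hint {
	unsigned deleted_count;		/* Deleted records seen on the page. */
	bool evict_soon;		/* Set to request urgent eviction. */
};

/* Called when leaving a page; crossed_page means we walked all of it. */
void
note_page_done(struct page_hint *hint, bool crossed_page)
{
	if (hint->deleted_count > DELETE_THRESHOLD ||
	    (crossed_page && hint->deleted_count > 0))
		hint->evict_soon = true;
	hint->deleted_count = 0;
}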
Code example #25
File: cur_file.c  Project: DavidAlphaFox/mongodb
/*
 * __wt_curfile_create --
 *	Open a cursor for a given btree handle.
 */
int
__wt_curfile_create(WT_SESSION_IMPL *session,
                    WT_CURSOR *owner, const char *cfg[], bool bulk, bool bitmap,
                    WT_CURSOR **cursorp)
{
    WT_CURSOR_STATIC_INIT(iface,
                          __wt_cursor_get_key,	/* get-key */
                          __wt_cursor_get_value,	/* get-value */
                          __wt_cursor_set_key,	/* set-key */
                          __wt_cursor_set_value,	/* set-value */
                          __curfile_compare,		/* compare */
                          __curfile_equals,		/* equals */
                          __curfile_next,		/* next */
                          __curfile_prev,		/* prev */
                          __curfile_reset,		/* reset */
                          __curfile_search,		/* search */
                          __curfile_search_near,	/* search-near */
                          __curfile_insert,		/* insert */
                          __curfile_update,		/* update */
                          __curfile_remove,		/* remove */
                          __wt_cursor_reconfigure,	/* reconfigure */
                          __curfile_close);		/* close */
    WT_BTREE *btree;
    WT_CONFIG_ITEM cval;
    WT_CURSOR *cursor;
    WT_CURSOR_BTREE *cbt;
    WT_CURSOR_BULK *cbulk;
    WT_DECL_RET;
    size_t csize;

    WT_STATIC_ASSERT(offsetof(WT_CURSOR_BTREE, iface) == 0);

    cbt = NULL;

    btree = S2BT(session);
    WT_ASSERT(session, btree != NULL);

    csize = bulk ? sizeof(WT_CURSOR_BULK) : sizeof(WT_CURSOR_BTREE);
    WT_RET(__wt_calloc(session, 1, csize, &cbt));

    cursor = &cbt->iface;
    *cursor = iface;
    cursor->session = &session->iface;
    cursor->internal_uri = btree->dhandle->name;
    cursor->key_format = btree->key_format;
    cursor->value_format = btree->value_format;

    cbt->btree = btree;
    if (bulk) {
        F_SET(cursor, WT_CURSTD_BULK);

        cbulk = (WT_CURSOR_BULK *)cbt;

        /* Optionally skip the validation of each bulk-loaded key. */
        WT_ERR(__wt_config_gets_def(
                   session, cfg, "skip_sort_check", 0, &cval));
        WT_ERR(__wt_curbulk_init(
                   session, cbulk, bitmap, cval.val == 0 ? 0 : 1));
    }

    /*
     * random_retrieval
     * Random retrieval cursors only support next, reset and close.
     */
    WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval));
    if (cval.val != 0) {
        __wt_cursor_set_notsup(cursor);
        cursor->next = __curfile_next_random;
        cursor->reset = __curfile_reset;
    }

    /* __wt_cursor_init is last so we don't have to clean up on error. */
    WT_ERR(__wt_cursor_init(
               cursor, cursor->internal_uri, owner, cfg, cursorp));

    WT_STAT_FAST_CONN_INCR(session, cursor_create);
    WT_STAT_FAST_DATA_INCR(session, cursor_create);

    if (0) {
err:
        __wt_free(session, cbt);
    }

    return (ret);
}
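
Note the next_random branch: such a cursor first gets a not-supported method table, then has only next and reset restored, so every other method fails cleanly. A usage sketch from the public API, assuming string keys:

#include <stdio.h>
#include <wiredtiger.h>

/* Sample one random key; only next/reset/close are supported. */
static int
sample_random(WT_SESSION *session, const char *uri)
{
	WT_CURSOR *cursor;
	const char *key;
	int ret;

	if ((ret = session->open_cursor(
	    session, uri, NULL, "next_random=true", &cursor)) != 0)
		return (ret);
	if ((ret = cursor->next(cursor)) == 0 &&
	    (ret = cursor->get_key(cursor, &key)) == 0)
		printf("sampled key: %s\n", key);
	(void)cursor->close(cursor);
	return (ret);
}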