Example #1
/*
 * __wt_lsm_tree_switch --
 *	Switch to a new in-memory tree.
 */
int
__wt_lsm_tree_switch(
    WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	uint32_t new_id;

	new_id = WT_ATOMIC_ADD(lsm_tree->last, 1); 
	WT_VERBOSE_RET(session, lsm, "Tree switch to: %d", new_id);

	if ((lsm_tree->nchunks + 1) * sizeof(*lsm_tree->chunk) >
	    lsm_tree->chunk_alloc)
		WT_ERR(__wt_realloc(session,
		    &lsm_tree->chunk_alloc,
		    WT_MAX(10 * sizeof(*lsm_tree->chunk),
		    2 * lsm_tree->chunk_alloc),
		    &lsm_tree->chunk));

	WT_ERR(__wt_calloc_def(session, 1, &chunk));
	chunk->id = new_id;
	lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
	WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));

	++lsm_tree->dsk_gen;
	WT_ERR(__wt_lsm_meta_write(session, lsm_tree));

err:	/* TODO: mark lsm_tree bad on error(?) */
	return (ret);
}
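The size check above is the standard __wt_realloc growth idiom: lsm_tree->chunk_alloc tracks the allocated byte count, which __wt_realloc updates on success, and each extension at least doubles the array while never asking for fewer than ten slots. A minimal sketch of the idiom in isolation, assuming only the WT_RET and WT_MAX macros from the tree (grow_entries and its parameters are invented names, not WiredTiger symbols):

/*
 * grow_entries --
 *	Hedged sketch of the doubling-growth idiom; not WiredTiger code.
 */
static int
grow_entries(WT_SESSION_IMPL *session,
    void ***entriesp, size_t *entry_allocp, u_int entry_count)
{
	/* Grow only when the next slot would overflow the allocation. */
	if ((entry_count + 1) * sizeof(void *) > *entry_allocp)
		WT_RET(__wt_realloc(session, entry_allocp,
		    WT_MAX(10 * sizeof(void *), 2 * *entry_allocp),
		    entriesp));
	return (0);
}

On success, __wt_realloc stores the new byte count back through entry_allocp, so repeated calls double the array each time it fills.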
Example #2
/*
 * __wt_lsm_tree_switch --
 *	Switch to a new in-memory tree.
 */
int
__wt_lsm_tree_switch(
    WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk, **cp;
	uint32_t in_memory, new_id;

	new_id = WT_ATOMIC_ADD(lsm_tree->last, 1); 

	if ((lsm_tree->nchunks + 1) * sizeof(*lsm_tree->chunk) >
	    lsm_tree->chunk_alloc)
		WT_ERR(__wt_realloc(session,
		    &lsm_tree->chunk_alloc,
		    WT_MAX(10 * sizeof(*lsm_tree->chunk),
		    2 * lsm_tree->chunk_alloc),
		    &lsm_tree->chunk));

	/*
	 * In the steady state, we expect that the checkpoint worker thread
	 * will keep up with inserts.  If not, we throttle the insert rate to
	 * avoid filling the cache with in-memory chunks.  Threads sleep every
	 * 100 operations, so take that into account in the calculation.
	 */
	for (in_memory = 1, cp = lsm_tree->chunk + lsm_tree->nchunks - 1;
	    in_memory < lsm_tree->nchunks && !F_ISSET(*cp, WT_LSM_CHUNK_ONDISK);
	    ++in_memory, --cp)
		;
	if (!F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) || in_memory <= 2)
		lsm_tree->throttle_sleep = 0;
	else if (in_memory == lsm_tree->nchunks ||
	    F_ISSET(*cp, WT_LSM_CHUNK_STABLE)) {
		/*
		 * No checkpoint has completed this run.  Keep slowing down
		 * inserts until one does.
		 */
		lsm_tree->throttle_sleep =
		    WT_MAX(20, 2 * lsm_tree->throttle_sleep);
	} else {
		chunk = lsm_tree->chunk[lsm_tree->nchunks - 1];
		lsm_tree->throttle_sleep = (long)((in_memory - 2) *
		    WT_TIMEDIFF(chunk->create_ts, (*cp)->create_ts) /
		    (20 * in_memory * chunk->count));
	}

	WT_VERBOSE_ERR(session, lsm, "Tree switch to: %d, throttle %d",
	    new_id, (int)lsm_tree->throttle_sleep);

	WT_ERR(__wt_calloc_def(session, 1, &chunk));
	chunk->id = new_id;
	lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
	WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));

	++lsm_tree->dsk_gen;
	F_CLR(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
	WT_ERR(__wt_lsm_meta_write(session, lsm_tree));

err:	/* TODO: mark lsm_tree bad on error(?) */
	return (ret);
}
Example #3
/*
 * __wt_json_alloc_unpack --
 *	Allocate space for, and unpack an entry into JSON format.
 */
int
__wt_json_alloc_unpack(WT_SESSION_IMPL *session, const void *buffer,
    size_t size, const char *fmt, WT_CURSOR_JSON *json,
    bool iskey, va_list ap)
{
	WT_CONFIG_ITEM *names;
	WT_DECL_RET;
	size_t needed;
	char **json_bufp;

	if (iskey) {
		names = &json->key_names;
		json_bufp = &json->key_buf;
	} else {
		names = &json->value_names;
		json_bufp = &json->value_buf;
	}
	needed = 0;
	WT_RET(__json_struct_size(session, buffer, size, fmt, names,
	    iskey, &needed));
	WT_RET(__wt_realloc(session, NULL, needed + 1, json_bufp));
	WT_RET(__json_struct_unpackv(session, buffer, size, fmt,
	    names, (u_char *)*json_bufp, needed + 1, iskey, ap));

	return (ret);
}
Example #4
File: meta_track.c Project: GYGit/mongo
/*
 * __meta_track_next --
 *	Extend the list of operations we're tracking, as necessary, and
 *	optionally return the next slot.
 */
static int
__meta_track_next(WT_SESSION_IMPL *session, WT_META_TRACK **trkp)
{
	size_t offset, sub_off;

	if (session->meta_track_next == NULL)
		session->meta_track_next = session->meta_track;

	offset = WT_PTRDIFF(session->meta_track_next, session->meta_track);
	sub_off = WT_PTRDIFF(session->meta_track_sub, session->meta_track);
	if (offset == session->meta_track_alloc) {
		WT_RET(__wt_realloc(session, &session->meta_track_alloc,
		    WT_MAX(2 * session->meta_track_alloc,
		    20 * sizeof(WT_META_TRACK)), &session->meta_track));

		/* Maintain positions in the new chunk of memory. */
		session->meta_track_next =
		    (uint8_t *)session->meta_track + offset;
		if (session->meta_track_sub != NULL)
			session->meta_track_sub =
			    (uint8_t *)session->meta_track + sub_off;
	}

	WT_ASSERT(session, session->meta_track_next != NULL);

	if (trkp != NULL) {
		*trkp = session->meta_track_next;
		session->meta_track_next = *trkp + 1;
	}

	return (0);
}
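Because __wt_realloc can move the base allocation, interior pointers such as meta_track_next and meta_track_sub must cross the call as byte offsets, as above. A hedged sketch of that idiom in isolation (all names invented, and the 4KB growth amount chosen arbitrarily):

/*
 * grow_preserving_cursor --
 *	Hedged sketch: save an interior pointer as an offset, grow the
 *	base allocation, then rebuild the pointer from the new base.
 */
static int
grow_preserving_cursor(WT_SESSION_IMPL *session,
    size_t *allocp, void **basep, uint8_t **cursorp)
{
	size_t off;

	off = WT_PTRDIFF(*cursorp, *basep);	/* position as an offset */
	WT_RET(__wt_realloc(session, allocp, *allocp + 4096, basep));
	*cursorp = (uint8_t *)*basep + off;	/* rebuild from new base */
	return (0);
}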
Example #5
/*
 * __logrec_jsonify_str --
 *	Unpack a string into JSON format, allocating the destination
 *	buffer as necessary.
 */
static int
__logrec_jsonify_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item)
{
	size_t needed;

	needed = __logrec_json_unpack_str(NULL, 0, item->data, item->size);
	WT_RET(__wt_realloc(session, NULL, needed, destp));
	(void)__logrec_json_unpack_str(*destp, needed, item->data, item->size);
	return (0);
}
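Examples #3 and #5 share a two-pass shape: measure the exact output size first, then __wt_realloc the destination with a NULL size tracker (the buffer is sized exactly each time rather than grown incrementally), then fill it. A minimal sketch of the shape, with snprintf standing in for the measuring and packing helpers (quote_str is an invented name):

/*
 * quote_str --
 *	Hedged sketch of the measure-then-fill pattern; not WiredTiger
 *	code.
 */
static int
quote_str(WT_SESSION_IMPL *session, char **destp, const char *src)
{
	size_t needed;

	/* Pass 1: measure the formatted size, excluding the nul byte. */
	needed = (size_t)snprintf(NULL, 0, "\"%s\"", src);

	/* NULL tracker: this buffer's allocated size isn't tracked. */
	WT_RET(__wt_realloc(session, NULL, needed + 1, destp));

	/* Pass 2: fill. */
	(void)snprintf(*destp, needed + 1, "\"%s\"", src);
	return (0);
}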
Example #6
/*
 * __lsm_copy_chunks --
 *	 Take a copy of part of the LSM tree chunk array so that we can work on
 *	 the contents without holding the LSM tree handle lock long term.
 */
static int
__lsm_copy_chunks(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_WORKER_COOKIE *cookie, bool old_chunks)
{
	WT_DECL_RET;
	u_int i, nchunks;
	size_t alloc;

	/* Always return zero chunks on error. */
	cookie->nchunks = 0;

	__wt_lsm_tree_readlock(session, lsm_tree);
	if (!lsm_tree->active) {
		__wt_lsm_tree_readunlock(session, lsm_tree);
		return (0);
	}

	/* Take a copy of the current state of the LSM tree. */
	nchunks = old_chunks ? lsm_tree->nold_chunks : lsm_tree->nchunks;
	alloc = old_chunks ? lsm_tree->old_alloc : lsm_tree->chunk_alloc;

	/*
	 * If the tree array of active chunks is larger than our current buffer,
	 * increase the size of our current buffer to match.
	 */
	if (cookie->chunk_alloc < alloc)
		WT_ERR(__wt_realloc(session,
		    &cookie->chunk_alloc, alloc, &cookie->chunk_array));
	if (nchunks > 0)
		memcpy(cookie->chunk_array,
		    old_chunks ? lsm_tree->old_chunks : lsm_tree->chunk,
		    nchunks * sizeof(*cookie->chunk_array));

	/*
	 * Mark each chunk as active, so we don't drop it until after we know
	 * it's safe.
	 */
	for (i = 0; i < nchunks; i++)
		(void)__wt_atomic_add32(&cookie->chunk_array[i]->refcnt, 1);

err:	__wt_lsm_tree_readunlock(session, lsm_tree);

	if (ret == 0)
		cookie->nchunks = nchunks;
	return (ret);
}
Example #7
/*
 * __wt_buf_grow --
 *	Grow a buffer that's currently in-use.
 */
int
__wt_buf_grow(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
{
	size_t offset;
	int set_data;

	WT_ASSERT(session, size <= UINT32_MAX);

	/* Clear buffers previously used for mapped returns. */
	if (F_ISSET(buf, WT_ITEM_MAPPED))
		__wt_buf_clear(buf);

	if (size > buf->memsize) {
		/*
		 * Grow the buffer's memory: if the data reference is not set
		 * or references the buffer's memory, maintain it.
		 */
		WT_ASSERT(session, buf->mem == NULL || buf->memsize > 0);
		if (buf->data == NULL) {
			offset = 0;
			set_data = 1;
		} else if (buf->data >= buf->mem &&
		    WT_PTRDIFF(buf->data, buf->mem) < buf->memsize) {
			offset = WT_PTRDIFF(buf->data, buf->mem);
			set_data = 1;
		} else {
			offset = 0;
			set_data = 0;
		}

		if (F_ISSET(buf, WT_ITEM_ALIGNED))
			WT_RET(__wt_realloc_aligned(
			    session, &buf->memsize, size, &buf->mem));
		else
			WT_RET(__wt_realloc(
			    session, &buf->memsize, size, &buf->mem));

		if (set_data)
			buf->data = (uint8_t *)buf->mem + offset;
	}
	return (0);
}
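A caller grows the buffer first, then writes into the backing memory; a hedged usage sketch (append_bytes is an invented helper, and it assumes buf->data references the start of buf->mem, the offset-zero case __wt_buf_grow preserves):

/*
 * append_bytes --
 *	Hedged sketch: append bytes to a WT_ITEM via __wt_buf_grow.
 */
static int
append_bytes(WT_SESSION_IMPL *session,
    WT_ITEM *buf, const void *data, size_t len)
{
	/* Make room for the current contents plus the new bytes. */
	WT_RET(__wt_buf_grow(session, buf, buf->size + len));

	/* Assumes the buffer's data starts at offset 0 of buf->mem. */
	memcpy((uint8_t *)buf->mem + buf->size, data, len);
	buf->size += len;
	return (0);
}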
Example #8
/*
 * __wt_conn_foc_add --
 *	Add a new entry into the connection's free-on-close list.
 */
static int
__wt_conn_foc_add(WT_SESSION_IMPL *session, ...)
{
    WT_CONNECTION_IMPL *conn;
    va_list ap;
    size_t cnt;
    void *p;

    conn = S2C(session);

    /*
     * Instead of using locks to protect configuration information, assume
     * we can atomically update a pointer to a chunk of memory, and because
     * a pointer is never partially written, readers will correctly see the
     * original or new versions of the memory.  Readers might be using the
     * old version as it's being updated, though, which means we cannot free
     * the old chunk of memory until all possible readers have finished.
     * Currently, that's on connection close: in other words, we can use
     * this because it's small amounts of memory, and we really, really do
     * not want to acquire locks every time we access configuration strings,
     * since that's done on every API call.
     *
     * Our caller is expected to be holding any locks we need.
     */
    /* Count the slots. */
    va_start(ap, session);
    for (cnt = 0; va_arg(ap, void *) != NULL; ++cnt)
        ;
    va_end(ap);

    if (conn->foc_cnt + cnt >= conn->foc_size) {
        WT_RET(__wt_realloc(session, NULL,
                            (conn->foc_size + cnt + 20) * sizeof(void *), &conn->foc));
        conn->foc_size += cnt + 20;
    }
    va_start(ap, session);
    while ((p = va_arg(ap, void *)) != NULL)
        conn->foc[conn->foc_cnt++] = p;
    va_end(ap);
    return (0);
}
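The argument list is a NULL-terminated set of pointers, so a caller can park several allocations in one call. A hedged illustration (save_config_strings and its parameters are invented, the strings must be non-NULL, and the call has to live in this file since the function is static):

/*
 * save_config_strings --
 *	Hedged sketch: queue two configuration chunks to be freed at
 *	connection close.
 */
static int
save_config_strings(WT_SESSION_IMPL *session, char *p1, char *p2)
{
	return (__wt_conn_foc_add(session, p1, p2, NULL));
}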
Example #9
/*
 * __rec_track_extend --
 *	Extend the list of objects we're tracking
 */
static int
__rec_track_extend(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_PAGE_MODIFY *mod;
	size_t bytes_allocated;

	mod = page->modify;

	/*
	 * The __wt_realloc() function uses the "bytes allocated" value
	 * to figure out how much of the memory it needs to clear (see
	 * the function for an explanation of why the memory is cleared;
	 * it's a security thing).  We can calculate the bytes allocated
	 * so far, which saves a size_t in the WT_PAGE_MODIFY structure.
	 * That's worth a little dance; we have one of these structures
	 * per modified page.
	 */
	bytes_allocated = mod->track_entries * sizeof(*mod->track);
	WT_RET(__wt_realloc(session, &bytes_allocated,
	    (mod->track_entries + 20) * sizeof(*mod->track), &mod->track));
	mod->track_entries += 20;
	return (0);
}
Example #10
/*
 * __hazard_exclusive --
 *	Request exclusive access to a page.
 */
static int
__hazard_exclusive(WT_SESSION_IMPL *session, WT_REF *ref, int top)
{
	/*
	 * Make sure there is space to track exclusive access so we can unlock
	 * to clean up.
	 */
	if (session->excl_next * sizeof(WT_REF *) == session->excl_allocated)
		WT_RET(__wt_realloc(session, &session->excl_allocated,
		    (session->excl_next + 50) * sizeof(WT_REF *),
		    &session->excl));

	/*
	 * Hazard pointers are acquired down the tree, which means we can't
	 * deadlock.
	 *
	 * Request exclusive access to the page.  The top-level page should
	 * already be in the locked state; lock child pages in memory.
	 * If another thread already has this page, give up.
	 */
	if (!top && !WT_ATOMIC_CAS(ref->state, WT_REF_MEM, WT_REF_LOCKED))
		return (EBUSY);	/* We couldn't change the state. */
	WT_ASSERT(session, ref->state == WT_REF_LOCKED);

	session->excl[session->excl_next++] = ref;

	/* Check for a matching hazard pointer. */
	if (__wt_page_hazard_check(session, ref->page) == NULL)
		return (0);

	WT_DSTAT_INCR(session, cache_eviction_hazard);
	WT_CSTAT_INCR(session, cache_eviction_hazard);

	WT_VERBOSE_RET(
	    session, evict, "page %p hazard request failed", ref->page);
	return (EBUSY);
}
Example #11
/*
 * __wt_buf_grow_worker --
 *	Attempt to reallocate a WT_ITEM's backing memory.
 */
int
__wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
{
	size_t offset;
	int copy_data;		/* Whether the data must be copied. */

	/*
	 * If buf->data already points into buf->mem, we only need to
	 * maintain its offset across the reallocation; no copy is needed.
	 */
	if (WT_DATA_IN_ITEM(buf)) {
		offset = WT_PTRDIFF(buf->data, buf->mem);
		copy_data = 0;
	} else {
		offset = 0;
		copy_data = buf->size ? 1 : 0;
	}

	/* Grow the backing memory as necessary. */
	if (size > buf->memsize) {
		if (F_ISSET(buf, WT_ITEM_ALIGNED))
			WT_RET(__wt_realloc_aligned(
			    session, &buf->memsize, size, &buf->mem));
		else
			WT_RET(__wt_realloc(
			    session, &buf->memsize, size, &buf->mem));
	}

	if (buf->data == NULL) {
		buf->data = buf->mem;
		buf->size = 0;
	} else {
		/* Copy data that lived outside the buffer's memory. */
		if (copy_data)
			memcpy(buf->mem, buf->data, buf->size);

		buf->data = (uint8_t *)buf->mem + offset;
	}

	return (0);
}
Example #12
/*
 * __wt_realloc_aligned --
 *	ANSI realloc function that aligns to buffer boundaries, configured with
 *	the "buffer_alignment" key to wiredtiger_open.
 */
int
__wt_realloc_aligned(WT_SESSION_IMPL *session,
    size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp)
{
#if defined(HAVE_POSIX_MEMALIGN)
	WT_DECL_RET;

	/*
	 * !!!
	 * This function MUST handle a NULL WT_SESSION_IMPL handle.
	 */
	if (session != NULL && S2C(session)->buffer_alignment > 0) {
		void *p, *newp;
		size_t bytes_allocated;

		/*
		 * Sometimes we're allocating memory and we don't care about the
		 * final length -- bytes_allocated_ret may be NULL.
		 */
		p = *(void **)retp;
		bytes_allocated =
		    (bytes_allocated_ret == NULL) ? 0 : *bytes_allocated_ret;
		WT_ASSERT(session,
		    (p == NULL && bytes_allocated == 0) ||
		    (p != NULL &&
		    (bytes_allocated_ret == NULL || bytes_allocated != 0)));
		WT_ASSERT(session, bytes_to_allocate != 0);
		WT_ASSERT(session, bytes_allocated < bytes_to_allocate);

		/*
		 * We are going to allocate an aligned buffer.  When we do this
		 * repeatedly, the allocator is expected to start on a boundary
		 * each time; account for that additional space by never asking
		 * for less than a full alignment size.  The primary use case
		 * for aligned buffers is Linux direct I/O, which requires that
		 * the size be a multiple of the alignment anyway.
		 */
		bytes_to_allocate =
		    WT_ALIGN(bytes_to_allocate, S2C(session)->buffer_alignment);

		WT_STAT_FAST_CONN_INCR(session, memory_allocation);

		if ((ret = posix_memalign(&newp,
		    S2C(session)->buffer_alignment,
		    bytes_to_allocate)) != 0)
			WT_RET_MSG(session, ret, "memory allocation");

		if (p != NULL)
			memcpy(newp, p, bytes_allocated);
		__wt_free(session, p);
		p = newp;

		/* Clear the allocated memory (see above). */
		memset((uint8_t *)p + bytes_allocated, 0,
		    bytes_to_allocate - bytes_allocated);

		/* Update caller's bytes allocated value. */
		if (bytes_allocated_ret != NULL)
			*bytes_allocated_ret = bytes_to_allocate;

		*(void **)retp = p;
		return (0);
	}
#endif
	/*
	 * If there is no posix_memalign function, or no alignment configured,
	 * fall back to realloc.
	 *
	 * Windows note: the Visual C CRT memalign does not match POSIX
	 * behavior and would also double each allocation, so it is bad
	 * for memory use.
	 */
	return (__wt_realloc(
	    session, bytes_allocated_ret, bytes_to_allocate, retp));
}
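WT_ALIGN rounds the request up to the next multiple of the alignment, so with a 512-byte buffer_alignment a 1000-byte request becomes 1024 bytes. A hedged sketch of the arithmetic, assuming the usual power-of-two mask definition (align_up is not the real WiredTiger macro):

/*
 * align_up --
 *	Hedged sketch of power-of-two round-up: align_up(1000, 512) is
 *	1024, and align_up(512, 512) stays 512.
 */
static size_t
align_up(size_t n, size_t align)	/* align: a power of two */
{
	return ((n + (align - 1)) & ~(align - 1));
}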
Example #13
/*
 * __wt_meta_ckptlist_get --
 *	Load all available checkpoint information for a file.
 */
int
__wt_meta_ckptlist_get(
    WT_SESSION_IMPL *session, const char *fname, WT_CKPT **ckptbasep)
{
	WT_CKPT *ckpt, *ckptbase;
	WT_CONFIG ckptconf;
	WT_CONFIG_ITEM a, k, v;
	WT_DECL_RET;
	WT_ITEM *buf;
	size_t allocated, slot;
	const char *config;
	char timebuf[64];

	*ckptbasep = NULL;

	buf = NULL;
	ckptbase = NULL;
	allocated = slot = 0;
	config = NULL;

	/* Retrieve the metadata information for the file. */
	WT_RET(__wt_metadata_read(session, fname, &config));

	/* Load any existing checkpoints into the array. */
	WT_ERR(__wt_scr_alloc(session, 0, &buf));
	if (__wt_config_getones(session, config, "checkpoint", &v) == 0 &&
	    __wt_config_subinit(session, &ckptconf, &v) == 0)
		for (; __wt_config_next(&ckptconf, &k, &v) == 0; ++slot) {
			if (slot * sizeof(WT_CKPT) == allocated)
				WT_ERR(__wt_realloc(session, &allocated,
				    (slot + 50) * sizeof(WT_CKPT), &ckptbase));
			ckpt = &ckptbase[slot];

			/*
			 * Copy the name, address (raw and hex), order and time
			 * into the slot.  If there's no address, it's a fake.
			 */
			WT_ERR(
			    __wt_strndup(session, k.str, k.len, &ckpt->name));

			WT_ERR(__wt_config_subgets(session, &v, "addr", &a));
			WT_ERR(
			    __wt_buf_set(session, &ckpt->addr, a.str, a.len));
			if (a.len == 0)
				F_SET(ckpt, WT_CKPT_FAKE);
			else
				WT_ERR(__wt_nhex_to_raw(
				    session, a.str, a.len, &ckpt->raw));

			WT_ERR(__wt_config_subgets(session, &v, "order", &a));
			if (a.val == 0)
				goto format;
			ckpt->order = a.val;

			WT_ERR(__wt_config_subgets(session, &v, "time", &a));
			if (a.len == 0)
				goto format;
			if (a.len > sizeof(timebuf) - 1)
				goto format;
			memcpy(timebuf, a.str, a.len);
			timebuf[a.len] = '\0';
			if (sscanf(timebuf, "%" SCNuMAX, &ckpt->sec) != 1)
				goto format;

			WT_ERR(__wt_config_subgets(session, &v, "size", &a));
			ckpt->ckpt_size = (uint64_t)a.val;
		}

	/*
	 * Allocate an extra slot for a new value, plus a slot to mark the end.
	 *
	 * This isn't very clean, but there's necessary cooperation between the
	 * schema layer (that maintains the list of checkpoints), the btree
	 * layer (that knows when the root page is written, creating a new
	 * checkpoint), and the block manager (which actually creates the
	 * checkpoint).  All of that cooperation is handled in the WT_CKPT
	 * structure referenced from the WT_BTREE structure.
	 */
	if ((slot + 2) * sizeof(WT_CKPT) > allocated)
		WT_ERR(__wt_realloc(session, &allocated,
		    (slot + 2) * sizeof(WT_CKPT), &ckptbase));

	/* Sort in creation-order. */
	qsort(ckptbase, slot, sizeof(WT_CKPT), __ckpt_compare_order);

	/* Return the array to our caller. */
	*ckptbasep = ckptbase;

	if (0) {
format:		WT_ERR_MSG(session, WT_ERROR, "corrupted checkpoint list");
err:		__wt_meta_ckptlist_free(session, ckptbase);
	}
	__wt_free(session, config);
	__wt_scr_free(&buf);

	return (ret);
}
Example #14
/*
 * __wt_realloc_aligned --
 *	ANSI realloc function that aligns to buffer boundaries, configured with
 *	the "buffer_alignment" key to wiredtiger_open.
 */
int
__wt_realloc_aligned(WT_SESSION_IMPL *session,
    size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp)
{
#if defined(HAVE_POSIX_MEMALIGN)
	WT_DECL_RET;

	/*
	 * !!!
	 * This function MUST handle a NULL WT_SESSION_IMPL handle.
	 */
	if (session != NULL && S2C(session)->buffer_alignment > 0) {
		void *p, *newp;
		size_t bytes_allocated;

		/*
		 * Sometimes we're allocating memory and we don't care about the
		 * final length -- bytes_allocated_ret may be NULL.
		 */
		p = *(void **)retp;
		bytes_allocated =
		    (bytes_allocated_ret == NULL) ? 0 : *bytes_allocated_ret;
		WT_ASSERT(session,
		    (p == NULL && bytes_allocated == 0) ||
		    (p != NULL &&
		    (bytes_allocated_ret == NULL || bytes_allocated != 0)));
		WT_ASSERT(session, bytes_to_allocate != 0);
		WT_ASSERT(session, bytes_allocated < bytes_to_allocate);

		if (session != NULL)
			WT_STAT_FAST_CONN_INCR(session, memory_allocation);

		if ((ret = posix_memalign(&newp,
		    S2C(session)->buffer_alignment,
		    bytes_to_allocate)) != 0)
			WT_RET_MSG(session, ret, "memory allocation");

		if (p != NULL)
			memcpy(newp, p, bytes_allocated);
		__wt_free(session, p);
		p = newp;

		/* Clear the allocated memory (see above). */
		memset((uint8_t *)p + bytes_allocated, 0,
		    bytes_to_allocate - bytes_allocated);

		/* Update caller's bytes allocated value. */
		if (bytes_allocated_ret != NULL)
			*bytes_allocated_ret = bytes_to_allocate;

		*(void **)retp = p;
		return (0);
	}
#endif
	/*
	 * If there is no posix_memalign function, or no alignment configured,
	 * fall back to realloc.
	 *
	 * Windows note: the Visual C CRT memalign does not match POSIX
	 * behavior and would also double each allocation, so it is bad
	 * for memory use.
	 */
	return (__wt_realloc(
	    session, bytes_allocated_ret, bytes_to_allocate, retp));
}
Example #15
File: cur_log.c Project: brianleepzx/mongo
/*
 * __curlog_kv --
 *	Set the key and value of the log cursor to return to the user.
 */
static int
__curlog_kv(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
{
	WT_CURSOR_LOG *cl;
	WT_ITEM item;
	uint32_t fileid, key_count, opsize, optype;

	cl = (WT_CURSOR_LOG *)cursor;
	/*
	 * If it is a commit and we have stepped over the header, peek to get
	 * the size and optype and read out any key/value from this operation.
	 */
	if ((key_count = cl->step_count++) > 0) {
		WT_RET(__wt_logop_read(session,
		    &cl->stepp, cl->stepp_end, &optype, &opsize));
		WT_RET(__curlog_op_read(session, cl, optype, opsize, &fileid));
		/* Position on the beginning of the next record part. */
		cl->stepp += opsize;
	} else {
		optype = WT_LOGOP_INVALID;
		fileid = 0;
		cl->opkey->data = NULL;
		cl->opkey->size = 0;
		/*
		 * For non-commit records, return the record without the
		 * header and with the size adjusted.  Add one to skip over
		 * the type, which is normally consumed by __wt_logrec_read.
		 */
		cl->opvalue->data = WT_LOG_SKIP_HEADER(cl->logrec->data) + 1;
		cl->opvalue->size = WT_LOG_REC_SIZE(cl->logrec->size) - 1;
	}
	/*
	 * The log cursor sets the LSN and step count as the cursor key, and
	 * log-record-related data in the value.  The data in the value
	 * contains any operation key/value that was in the log record.
	 * For the special case that the caller needs the result in raw form,
	 * we create packed versions of the key/value.
	 */
	if (FLD_ISSET(cursor->flags, WT_CURSTD_RAW)) {
		memset(&item, 0, sizeof(item));
		WT_RET(wiredtiger_struct_size((WT_SESSION *)session,
		    &item.size, WT_LOGC_KEY_FORMAT, cl->cur_lsn->l.file,
		    cl->cur_lsn->l.offset, key_count));
		WT_RET(__wt_realloc(session, NULL, item.size, &cl->packed_key));
		item.data = cl->packed_key;
		WT_RET(wiredtiger_struct_pack((WT_SESSION *)session,
		    cl->packed_key, item.size, WT_LOGC_KEY_FORMAT,
		    cl->cur_lsn->l.file, cl->cur_lsn->l.offset, key_count));
		__wt_cursor_set_key(cursor, &item);

		WT_RET(wiredtiger_struct_size((WT_SESSION *)session,
		    &item.size, WT_LOGC_VALUE_FORMAT, cl->txnid, cl->rectype,
		    optype, fileid, cl->opkey, cl->opvalue));
		WT_RET(__wt_realloc(session, NULL, item.size,
		    &cl->packed_value));
		item.data = cl->packed_value;
		WT_RET(wiredtiger_struct_pack((WT_SESSION *)session,
		    cl->packed_value, item.size, WT_LOGC_VALUE_FORMAT,
		    cl->txnid, cl->rectype, optype, fileid, cl->opkey,
		    cl->opvalue));
		__wt_cursor_set_value(cursor, &item);
	} else {
		__wt_cursor_set_key(cursor, cl->cur_lsn->l.file,
		    cl->cur_lsn->l.offset, key_count);
		__wt_cursor_set_value(cursor, cl->txnid, cl->rectype, optype,
		    fileid, cl->opkey, cl->opvalue);
	}
	return (0);
}
Example #16
/*
 * __wt_scr_alloc_func --
 *	Scratch buffer allocation function.
 */
int
__wt_scr_alloc_func(WT_SESSION_IMPL *session, size_t size, WT_ITEM **scratchp
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
	WT_DECL_RET;
	WT_ITEM *buf, **p, **best, **slot;
	size_t allocated;
	u_int i;

	/* Don't risk the caller not catching the error. */
	*scratchp = NULL;

	/*
	 * Each WT_SESSION_IMPL has an array of scratch buffers available for
	 * use by any function.  We use WT_ITEM structures for scratch memory
	 * because we already have functions that do variable-length allocation
	 * on a WT_ITEM.  Scratch buffers are allocated only by a single thread
	 * of control, so no locking is necessary.
	 *
	 * Walk the array, looking for a buffer we can use.
	 */
	for (i = 0, best = slot = NULL,
	    p = session->scratch; i < session->scratch_alloc; ++i, ++p) {
		/* If we find an empty slot, remember it. */
		if ((buf = *p) == NULL) {
			if (slot == NULL)
				slot = p;
			continue;
		}

		if (F_ISSET(buf, WT_ITEM_INUSE))
			continue;

		/*
		 * If we find a buffer that's not in-use, check its size: we
		 * want the smallest buffer larger than the requested size,
		 * or the largest buffer if none are large enough.
		 */
		if (best == NULL ||
		    (buf->memsize <= size && buf->memsize > (*best)->memsize) ||
		    (buf->memsize >= size && buf->memsize < (*best)->memsize))
			best = p;

		/* If we find a perfect match, use it. */
		if ((*best)->memsize == size)
			break;
	}

	/*
	 * If we didn't find a free buffer, extend the array and use the first
	 * slot we allocated.
	 */
	if (best == NULL && slot == NULL) {
		allocated = session->scratch_alloc * sizeof(WT_ITEM *);
		WT_ERR(__wt_realloc(session, &allocated,
		    (session->scratch_alloc + 10) * sizeof(WT_ITEM *),
		    &session->scratch));
#ifdef HAVE_DIAGNOSTIC
		allocated = session->scratch_alloc * sizeof(WT_SCRATCH_TRACK);
		WT_ERR(__wt_realloc(session, &allocated,
		    (session->scratch_alloc + 10) * sizeof(WT_SCRATCH_TRACK),
		    &session->scratch_track));
#endif
		slot = session->scratch + session->scratch_alloc;
		session->scratch_alloc += 10;
	}

	/*
	 * If slot is non-NULL, we found an empty slot; try to allocate a
	 * buffer.
	 */
	if (best == NULL) {
		WT_ASSERT(session, slot != NULL);
		best = slot;

		WT_ERR(__wt_calloc_one(session, best));

		/* Scratch buffers must be aligned. */
		F_SET(*best, WT_ITEM_ALIGNED);
	}

	/* Grow the buffer as necessary and return. */
	session->scratch_cached -= (*best)->memsize;
	WT_ERR(__wt_buf_init(session, *best, size));
	F_SET(*best, WT_ITEM_INUSE);

#ifdef HAVE_DIAGNOSTIC
	session->scratch_track[best - session->scratch].file = file;
	session->scratch_track[best - session->scratch].line = line;
#endif

	*scratchp = *best;
	return (0);

err:	WT_RET_MSG(session, ret,
	    "session unable to allocate a scratch buffer");
}
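In diagnostic builds the file/line pair is normally supplied by a wrapper macro, so call sites read the same either way. A hedged sketch of that convention (an assumed shape, not a quote of the real WiredTiger header):

#ifdef HAVE_DIAGNOSTIC
#define	__wt_scr_alloc(session, size, scratchp)				\
	__wt_scr_alloc_func(session, size, scratchp, __FILE__, __LINE__)
#else
#define	__wt_scr_alloc(session, size, scratchp)				\
	__wt_scr_alloc_func(session, size, scratchp)
#endif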
Example #17
/*
 * __wt_struct_repack --
 *	Return the subset of the packed buffer that represents part of
 *	the format.  If the result is not contiguous in the existing
 *	buffer, a buffer is reallocated and filled.
 */
int
__wt_struct_repack(WT_SESSION_IMPL *session, const char *infmt,
    const char *outfmt, const WT_ITEM *inbuf, WT_ITEM *outbuf,
    void **reallocp)
{
	WT_DECL_PACK_VALUE(pvin);
	WT_DECL_PACK_VALUE(pvout);
	WT_DECL_RET;
	WT_PACK packin, packout;
	const uint8_t *before, *end, *p;
	uint8_t *newbuf, *pout;
	size_t len;
	const void *start;

	start = newbuf = NULL;
	p = inbuf->data;
	end = p + inbuf->size;

	/*
	 * Handle this non-contiguous case: 'U' -> 'u' at the end of the buf.
	 * The former case has the size embedded before the item, the latter
	 * does not.
	 */
	if ((len = strlen(outfmt)) > 1 && outfmt[len - 1] == 'u' &&
	    strlen(infmt) > len && infmt[len - 1] == 'U') {
		WT_ERR(__wt_realloc(session, NULL, inbuf->size, reallocp));
		pout = *reallocp;
	} else
		pout = NULL;

	WT_ERR(__pack_init(session, &packout, outfmt));
	WT_ERR(__pack_init(session, &packin, infmt));

	/* Outfmt should complete before infmt */
	while ((ret = __pack_next(&packout, &pvout)) == 0) {
		WT_ERR(__pack_next(&packin, &pvin));
		before = p;
		WT_ERR(__unpack_read(session, &pvin, &p, (size_t)(end - p)));
		if (pvout.type != pvin.type) {
			if (pvout.type == 'u' && pvin.type == 'U') {
				/* Skip the prefixed size, we don't need it */
				WT_ERR(__wt_struct_unpack_size(session, before,
				    (size_t)(end - before), "I", &len));
				before += len;
			} else
				WT_ERR(ENOTSUP);
		}
		if (pout != NULL) {
			memcpy(pout, before, WT_PTRDIFF(p, before));
			pout += p - before;
		} else if (start == NULL)
			start = before;
	}
	WT_ERR_NOTFOUND_OK(ret);

	/* Be paranoid - __pack_write should never overflow. */
	WT_ASSERT(session, p <= end);

	if (pout != NULL) {
		outbuf->data = *reallocp;
		outbuf->size = WT_PTRDIFF(pout, *reallocp);
	} else {
		outbuf->data = start;
		outbuf->size = WT_PTRDIFF(p, start);
	}

err:	return (ret);
}
Example #18
/*
 * __wt_realloc_aligned --
 *	ANSI realloc function that aligns to buffer boundaries, configured with
 *	the "buffer_alignment" key to wiredtiger_open.
 */
int
__wt_realloc_aligned(WT_SESSION_IMPL *session,
    size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp)
{
#if defined(HAVE_POSIX_MEMALIGN)
	int ret;

	/*
	 * !!!
	 * This function MUST handle a NULL WT_SESSION_IMPL handle.
	 */
	if (session != NULL && S2C(session)->buffer_alignment > 0) {
		void *p, *newp;
		size_t bytes_allocated;

		WT_ASSERT(session, bytes_to_allocate != 0);

		/*
		 * Sometimes we're allocating memory and we don't care about the
		 * final length -- bytes_allocated_ret may be NULL.
		 */
		bytes_allocated = (bytes_allocated_ret == NULL) ?
		    0 : *bytes_allocated_ret;
		WT_ASSERT(session, bytes_allocated < bytes_to_allocate);

		p = *(void **)retp;

		WT_ASSERT(session, p == NULL || bytes_allocated != 0);

		if (p == NULL && session != NULL && S2C(session)->stats != NULL)
			WT_CSTAT_INCR(session, memalloc);

		if ((ret = posix_memalign(&newp,
		    S2C(session)->buffer_alignment,
		    bytes_to_allocate)) != 0)
			WT_RET_MSG(session, ret, "memory allocation");

		if (p != NULL)
			memcpy(newp, p, bytes_allocated);
		__wt_free(session, p);
		p = newp;

		/* Clear the allocated memory (see above). */
		memset((uint8_t *)p + bytes_allocated, 0,
		    bytes_to_allocate - bytes_allocated);

		/* Update caller's bytes allocated value. */
		if (bytes_allocated_ret != NULL)
			*bytes_allocated_ret = bytes_to_allocate;

		*(void **)retp = p;
		return (0);
	}
#endif
	/*
	 * If there is no posix_memalign function, or no alignment configured,
	 * fall back to realloc.
	 */
	return (__wt_realloc(
	    session, bytes_allocated_ret, bytes_to_allocate, retp));
}
Example #19
/*
 * __thread_group_resize --
 *	Resize an array of utility threads already holding the lock.
 */
static int
__thread_group_resize(
    WT_SESSION_IMPL *session, WT_THREAD_GROUP *group,
    uint32_t new_min, uint32_t new_max, uint32_t flags)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_THREAD *thread;
	size_t alloc;
	uint32_t i, session_flags;

	conn = S2C(session);
	session_flags = 0;

	WT_ASSERT(session,
	    group->current_threads <= group->alloc &&
	    __wt_rwlock_islocked(session, group->lock));

	if (new_min == group->min && new_max == group->max)
		return (0);

	/*
	 * Call shrink to reduce the number of thread structures and running
	 * threads if required by the change in group size.
	 */
	WT_RET(__thread_group_shrink(session, group, new_max));

	/*
	 * Only reallocate the thread array if it is the largest ever, since
	 * our realloc doesn't support shrinking the allocated size.
	 */
	if (group->alloc < new_max) {
		alloc = group->alloc * sizeof(*group->threads);
		WT_RET(__wt_realloc(session, &alloc,
		    new_max * sizeof(*group->threads), &group->threads));
		group->alloc = new_max;
	}

	/*
	 * Initialize the structures based on the previous group size, not
	 * the previous allocated size.
	 */
	for (i = group->max; i < new_max; i++) {
		WT_ERR(__wt_calloc_one(session, &thread));
		/*
		 * Threads get their own session and lookaside table cursor
		 * if the lookaside table is open. Note that threads are
		 * started during recovery, before the lookaside table is
		 * created.
		 */
		if (LF_ISSET(WT_THREAD_CAN_WAIT))
			session_flags = WT_SESSION_CAN_WAIT;
		if (F_ISSET(conn, WT_CONN_LAS_OPEN))
			FLD_SET(session_flags, WT_SESSION_LOOKASIDE_CURSOR);
		WT_ERR(__wt_open_internal_session(conn, group->name,
		    false, session_flags, &thread->session));
		if (LF_ISSET(WT_THREAD_PANIC_FAIL))
			F_SET(thread, WT_THREAD_PANIC_FAIL);
		thread->id = i;
		thread->run_func = group->run_func;
		WT_ASSERT(session, group->threads[i] == NULL);
		group->threads[i] = thread;
	}

	if (group->current_threads < new_min)
		WT_ERR(__thread_group_grow(session, group, new_min));

err:	/*
	 * Update the thread group information even on failure to improve our
	 * chances of cleaning up properly.
	 */
	group->max = new_max;
	group->min = new_min;

	/*
	 * An error resizing a thread array is fatal, it should only happen
	 * in an out of memory situation.
	 */
	if (ret != 0) {
		WT_TRET(__wt_thread_group_destroy(session, group));
		WT_PANIC_RET(session, ret, "Error while resizing thread group");
	}
	return (ret);
}
Example #20
/*
 * __wt_schema_open_index --
 *	Open one or more indices for a table.
 */
int
__wt_schema_open_index(WT_SESSION_IMPL *session,
    WT_TABLE *table, const char *idxname, size_t len, WT_INDEX **indexp)
{
	WT_CURSOR *cursor;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_INDEX *idx;
	u_int i;
	int cmp, match;
	const char *idxconf, *name, *tablename, *uri;

	/* Check if we've already done the work. */
	if (idxname == NULL && table->idx_complete)
		return (0);

	cursor = NULL;
	idx = NULL;

	/* Build a search key. */
	tablename = table->name;
	(void)WT_PREFIX_SKIP(tablename, "table:");
	WT_ERR(__wt_scr_alloc(session, 512, &tmp));
	WT_ERR(__wt_buf_fmt(session, tmp, "index:%s:", tablename));

	/* Find matching indices. */
	WT_ERR(__wt_metadata_cursor(session, NULL, &cursor));
	cursor->set_key(cursor, tmp->data);
	if ((ret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0)
		ret = cursor->next(cursor);
	for (i = 0; ret == 0; i++, ret = cursor->next(cursor)) {
		WT_ERR(cursor->get_key(cursor, &uri));
		name = uri;
		if (!WT_PREFIX_SKIP(name, tmp->data))
			break;

		/* Is this the index we are looking for? */
		match = idxname == NULL || WT_STRING_MATCH(name, idxname, len);

		/*
		 * Ensure there is space, including if we have to make room for
		 * a new entry in the middle of the list.
		 */
		if (table->idx_alloc <= sizeof(WT_INDEX *) *
		    ((size_t)WT_MAX(i, table->nindices) + 1))
			WT_ERR(__wt_realloc(session, &table->idx_alloc,
			    WT_MAX(10 * sizeof(WT_INDEX *),
			    2 * table->idx_alloc), &table->indices));

		/* Keep the in-memory list in sync with the metadata. */
		cmp = 0;
		while (table->indices[i] != NULL &&
		    (cmp = strcmp(uri, table->indices[i]->name)) > 0) {
			/* Index no longer exists, remove it. */
			__wt_free(session, table->indices[i]);
			memmove(&table->indices[i], &table->indices[i + 1],
			    (table->nindices - i) * sizeof(WT_INDEX *));
			table->indices[--table->nindices] = NULL;
		}
		if (cmp < 0) {
			/* Make room for a new index. */
			memmove(&table->indices[i + 1], &table->indices[i],
			    (table->nindices - i) * sizeof(WT_INDEX *));
			table->indices[i] = NULL;
			++table->nindices;
		}

		if (!match)
			continue;

		if (table->indices[i] == NULL) {
			WT_ERR(cursor->get_value(cursor, &idxconf));
			WT_ERR(__wt_calloc_def(session, 1, &idx));
			WT_ERR(__wt_strdup(session, uri, &idx->name));
			WT_ERR(__wt_strdup(session, idxconf, &idx->config));
			WT_ERR(__open_index(session, table, idx));

			table->indices[i] = idx;
			idx = NULL;
		}

		/* If we were looking for a single index, we're done. */
		if (indexp != NULL)
			*indexp = table->indices[i];
		if (idxname != NULL)
			break;
	}
	WT_ERR_NOTFOUND_OK(ret);

	/* If we did a full pass, we won't need to do it again. */
	if (idxname == NULL) {
		table->nindices = i;
		table->idx_complete = 1;
	}

err:	__wt_scr_free(&tmp);
	if (idx != NULL)
		__wt_schema_destroy_index(session, idx);
	if (cursor != NULL)
		WT_TRET(cursor->close(cursor));
	return (ret);
}