Example #1
/*
 * __wt_page_in_func --
 *	Acquire a hazard pointer to a page; if the page is not in-memory,
 *	read it from the disk and build an in-memory version.
 */
int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
	WT_DECL_RET;
	WT_PAGE *page;
	u_int sleep_cnt, wait_cnt;
	int busy, force_attempts, oldgen;

	for (force_attempts = oldgen = 0, wait_cnt = 0;;) {
		switch (ref->state) {
		case WT_REF_DISK:
		case WT_REF_DELETED:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);

			/*
			 * The page isn't in memory, attempt to read it.
			 * Make sure there is space in the cache.
			 */
			WT_RET(__wt_cache_full_check(session));
			WT_RET(__wt_cache_read(session, ref));
			oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
			    F_ISSET(session, WT_SESSION_NO_CACHE);
			continue;
		case WT_REF_READING:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);
			WT_STAT_FAST_CONN_INCR(session, page_read_blocked);
			break;
		case WT_REF_LOCKED:
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);
			WT_STAT_FAST_CONN_INCR(session, page_locked_blocked);
			break;
		case WT_REF_SPLIT:
			return (WT_RESTART);
		case WT_REF_MEM:
			/*
			 * The page is in memory: get a hazard pointer, update
			 * the page's LRU and return.  The expected reason we
			 * can't get a hazard pointer is because the page is
			 * being evicted; yield and try again.
			 */
#ifdef HAVE_DIAGNOSTIC
			WT_RET(
			    __wt_hazard_set(session, ref, &busy, file, line));
#else
			WT_RET(__wt_hazard_set(session, ref, &busy));
#endif
			if (busy) {
				WT_STAT_FAST_CONN_INCR(
				    session, page_busy_blocked);
				break;
			}

			page = ref->page;
			WT_ASSERT(session, page != NULL);

			/*
			 * Forcibly evict pages that are too big.
			 */
			if (force_attempts < 10 &&
			    __evict_force_check(session, page, flags)) {
				++force_attempts;
				ret = __wt_page_release_evict(session, ref);
				/* If forced eviction fails, stall. */
				if (ret == EBUSY) {
					ret = 0;
					wait_cnt += 1000;
					WT_STAT_FAST_CONN_INCR(session,
					    page_forcible_evict_blocked);
					break;
				} else
					WT_RET(ret);

				/*
				 * The result of a successful forced eviction
				 * is a page-state transition (potentially to
				 * an in-memory page we can use, or a restart
				 * return for our caller), continue the outer
				 * page-acquisition loop.
				 */
				continue;
			}

			/* Check if we need an autocommit transaction. */
			if ((ret = __wt_txn_autocommit_check(session)) != 0) {
				WT_TRET(__wt_hazard_clear(session, page));
				return (ret);
			}

			/*
			 * If we read the page and we are configured to not
			 * trash the cache, set the oldest read generation so
			 * the page is forcibly evicted as soon as possible.
			 *
			 * Otherwise, update the page's read generation.
			 */
			if (oldgen && page->read_gen == WT_READGEN_NOTSET)
				__wt_page_evict_soon(page);
			else if (!LF_ISSET(WT_READ_NO_GEN) &&
			    page->read_gen != WT_READGEN_OLDEST &&
			    page->read_gen < __wt_cache_read_gen(session))
				page->read_gen =
				    __wt_cache_read_gen_set(session);

			return (0);
		WT_ILLEGAL_VALUE(session);
		}

		/*
		 * We failed to get the page -- yield before retrying, and if
		 * we've yielded enough times, start sleeping so we don't burn
		 * CPU to no purpose.
		 */
		if (++wait_cnt < 1000)
			__wt_yield();
		else {
			sleep_cnt = WT_MIN(wait_cnt, 10000);
			wait_cnt *= 2;
			WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt);
			__wt_sleep(0, sleep_cnt);
		}
	}
}
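The #ifdef'd file/line parameters in the signature imply that callers reach this function through a wrapper macro rather than calling it directly. A plausible sketch of that convention (the macro name and exact form are assumptions, inferred from the HAVE_DIAGNOSTIC signature above, not taken from this listing):

/*
 * Assumed wrapper macro: pass the caller's file/line only in diagnostic
 * builds, matching the #ifdef HAVE_DIAGNOSTIC parameters above.
 */
#ifdef HAVE_DIAGNOSTIC
#define	__wt_page_in(session, ref, flags)				\
	__wt_page_in_func(session, ref, flags, __FILE__, __LINE__)
#else
#define	__wt_page_in(session, ref, flags)				\
	__wt_page_in_func(session, ref, flags)
#endif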
Example #2
/*
 * __wt_lsm_meta_write --
 *	Write the metadata for an LSM tree.
 */
int
__wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_DECL_ITEM(buf);
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	u_int i;
	int first;

	WT_RET(__wt_scr_alloc(session, 0, &buf));
	WT_ERR(__wt_buf_fmt(session, buf,
	    "key_format=%s,value_format=%s,bloom_config=(%s),file_config=(%s)",
	    lsm_tree->key_format, lsm_tree->value_format,
	    lsm_tree->bloom_config, lsm_tree->file_config));
	if (lsm_tree->collator_name != NULL)
		WT_ERR(__wt_buf_catfmt(
		    session, buf, ",collator=%s", lsm_tree->collator_name));
	WT_ERR(__wt_buf_catfmt(session, buf,
	    ",last=%" PRIu32
	    ",chunk_max=%" PRIu64
	    ",chunk_size=%" PRIu64
	    ",auto_throttle=%" PRIu32
	    ",merge_max=%" PRIu32
	    ",merge_min=%" PRIu32
	    ",bloom=%" PRIu32
	    ",bloom_bit_count=%" PRIu32
	    ",bloom_hash_count=%" PRIu32,
	    lsm_tree->last, lsm_tree->chunk_max, lsm_tree->chunk_size,
	    F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) ? 1 : 0,
	    lsm_tree->merge_max, lsm_tree->merge_min, lsm_tree->bloom,
	    lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count));
	WT_ERR(__wt_buf_catfmt(session, buf, ",chunks=["));
	for (i = 0; i < lsm_tree->nchunks; i++) {
		chunk = lsm_tree->chunk[i];
		if (i > 0)
			WT_ERR(__wt_buf_catfmt(session, buf, ","));
		WT_ERR(__wt_buf_catfmt(session, buf, "id=%" PRIu32, chunk->id));
		if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
			WT_ERR(__wt_buf_catfmt(session, buf, ",bloom"));
		if (chunk->size != 0)
			WT_ERR(__wt_buf_catfmt(session, buf,
			    ",chunk_size=%" PRIu64, chunk->size));
		if (chunk->count != 0)
			WT_ERR(__wt_buf_catfmt(
			    session, buf, ",count=%" PRIu64, chunk->count));
		WT_ERR(__wt_buf_catfmt(
		    session, buf, ",generation=%" PRIu32, chunk->generation));
	}
	WT_ERR(__wt_buf_catfmt(session, buf, "]"));
	WT_ERR(__wt_buf_catfmt(session, buf, ",old_chunks=["));
	first = 1;
	for (i = 0; i < lsm_tree->nold_chunks; i++) {
		chunk = lsm_tree->old_chunks[i];
		WT_ASSERT(session, chunk != NULL);
		if (first)
			first = 0;
		else
			WT_ERR(__wt_buf_catfmt(session, buf, ","));
		WT_ERR(__wt_buf_catfmt(session, buf, "\"%s\"", chunk->uri));
		if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
			WT_ERR(__wt_buf_catfmt(
			    session, buf, ",bloom=\"%s\"", chunk->bloom_uri));
	}
	WT_ERR(__wt_buf_catfmt(session, buf, "]"));
	ret = __wt_metadata_update(session, lsm_tree->name, buf->data);
	WT_ERR(ret);

err:	__wt_scr_free(&buf);
	return (ret);
}
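Nearly every call in these examples goes through WT_RET or WT_ERR. A simplified sketch of what the two macros are assumed to do, inferred from how they are used here: WT_RET returns the error to the caller immediately, while WT_ERR stores it in the local ret and jumps to the function's err: label so cleanup such as __wt_scr_free(&buf) still runs.

/* Assumed, simplified forms of the error-handling macros. */
#define	WT_RET(a) do {							\
	int __ret;							\
	if ((__ret = (a)) != 0)						\
		return (__ret);						\
} while (0)

#define	WT_ERR(a) do {							\
	if ((ret = (a)) != 0)						\
		goto err;						\
} while (0)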
Example #3
/*
 * __txn_printlog --
 *	Print a log record in a human-readable format.
 */
static int
__txn_printlog(WT_SESSION_IMPL *session,
    WT_ITEM *rawrec, WT_LSN *lsnp, WT_LSN *next_lsnp,
    void *cookie, int firstrecord)
{
	WT_LOG_RECORD *logrec;
	WT_TXN_PRINTLOG_ARGS *args;
	const uint8_t *end, *p;
	const char *msg;
	uint64_t txnid;
	uint32_t fileid, lsnfile, lsnoffset, rectype;
	int32_t start;
	bool compressed;

	WT_UNUSED(next_lsnp);
	args = cookie;

	p = WT_LOG_SKIP_HEADER(rawrec->data);
	end = (const uint8_t *)rawrec->data + rawrec->size;
	logrec = (WT_LOG_RECORD *)rawrec->data;
	compressed = F_ISSET(logrec, WT_LOG_RECORD_COMPRESSED);

	/* First, peek at the log record type. */
	WT_RET(__wt_logrec_read(session, &p, end, &rectype));

	if (!firstrecord)
		WT_RET(__wt_fprintf(session, WT_STDOUT(session), ",\n"));

	WT_RET(__wt_fprintf(session, WT_STDOUT(session),
	    "  { \"lsn\" : [%" PRIu32 ",%" PRIu32 "],\n",
	    lsnp->l.file, lsnp->l.offset));
	WT_RET(__wt_fprintf(session, WT_STDOUT(session),
	    "    \"hdr_flags\" : \"%s\",\n", compressed ? "compressed" : ""));
	WT_RET(__wt_fprintf(session, WT_STDOUT(session),
	    "    \"rec_len\" : %" PRIu32 ",\n", logrec->len));
	WT_RET(__wt_fprintf(session, WT_STDOUT(session),
	    "    \"mem_len\" : %" PRIu32 ",\n",
	    compressed ? logrec->mem_len : logrec->len));

	switch (rectype) {
	case WT_LOGREC_CHECKPOINT:
		WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p),
		    WT_UNCHECKED_STRING(II), &lsnfile, &lsnoffset));
		WT_RET(__wt_fprintf(session, WT_STDOUT(session),
		    "    \"type\" : \"checkpoint\",\n"));
		WT_RET(__wt_fprintf(session, WT_STDOUT(session),
		    "    \"ckpt_lsn\" : [%" PRIu32 ",%" PRIu32 "]\n",
		    lsnfile, lsnoffset));
		break;

	case WT_LOGREC_COMMIT:
		WT_RET(__wt_vunpack_uint(&p, WT_PTRDIFF(end, p), &txnid));
		WT_RET(__wt_fprintf(session, WT_STDOUT(session),
		    "    \"type\" : \"commit\",\n"));
		WT_RET(__wt_fprintf(session, WT_STDOUT(session),
		    "    \"txnid\" : %" PRIu64 ",\n", txnid));
		WT_RET(__txn_oplist_printlog(session, &p, end, args->flags));
		break;

	case WT_LOGREC_FILE_SYNC:
		WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p),
		    WT_UNCHECKED_STRING(Ii), &fileid, &start));
		WT_RET(__wt_fprintf(session, WT_STDOUT(session),
		    "    \"type\" : \"file_sync\",\n"));
		WT_RET(__wt_fprintf(session, WT_STDOUT(session),
		    "    \"fileid\" : %" PRIu32 ",\n", fileid));
		WT_RET(__wt_fprintf(session, WT_STDOUT(session),
		    "    \"start\" : %" PRId32 "\n", start));
		break;

	case WT_LOGREC_MESSAGE:
		WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p),
		    WT_UNCHECKED_STRING(S), &msg));
		WT_RET(__wt_fprintf(session, WT_STDOUT(session),
		    "    \"type\" : \"message\",\n"));
		WT_RET(__wt_fprintf(session, WT_STDOUT(session),
		    "    \"message\" : \"%s\"\n", msg));
		break;

	case WT_LOGREC_SYSTEM:
		WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p),
		    WT_UNCHECKED_STRING(II), &lsnfile, &lsnoffset));
		WT_RET(__wt_fprintf(session, WT_STDOUT(session),
		    "    \"type\" : \"system\",\n"));
		WT_RET(__txn_oplist_printlog(session, &p, end, args->flags));
		break;
	}

	WT_RET(__wt_fprintf(session, WT_STDOUT(session), "  }"));

	return (0);
}
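Taken together, the fprintf calls above emit one JSON-style object per log record. With hypothetical values, a message record would print roughly as:

  { "lsn" : [1,256],
    "hdr_flags" : "",
    "rec_len" : 64,
    "mem_len" : 64,
    "type" : "message",
    "message" : "example"
  }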
Example #4
/*
 * __wt_curfile_create --
 *	Open a cursor for a given btree handle.
 */
int
__wt_curfile_create(WT_SESSION_IMPL *session,
    WT_CURSOR *owner, const char *cfg[], int bulk, int bitmap,
    WT_CURSOR **cursorp)
{
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,	/* get-key */
	    __wt_cursor_get_value,	/* get-value */
	    __wt_cursor_set_key,	/* set-key */
	    __wt_cursor_set_value,	/* set-value */
	    __curfile_compare,		/* compare */
	    __curfile_equals,		/* equals */
	    __curfile_next,		/* next */
	    __curfile_prev,		/* prev */
	    __curfile_reset,		/* reset */
	    __curfile_search,		/* search */
	    __curfile_search_near,	/* search-near */
	    __curfile_insert,		/* insert */
	    __curfile_update,		/* update */
	    __curfile_remove,		/* remove */
	    __wt_cursor_reconfigure,	/* reconfigure */
	    __curfile_close);		/* close */
	WT_BTREE *btree;
	WT_CONFIG_ITEM cval;
	WT_CURSOR *cursor;
	WT_CURSOR_BTREE *cbt;
	WT_CURSOR_BULK *cbulk;
	WT_DECL_RET;
	size_t csize;

	WT_STATIC_ASSERT(offsetof(WT_CURSOR_BTREE, iface) == 0);

	cbt = NULL;

	btree = S2BT(session);
	WT_ASSERT(session, btree != NULL);

	csize = bulk ? sizeof(WT_CURSOR_BULK) : sizeof(WT_CURSOR_BTREE);
	WT_RET(__wt_calloc(session, 1, csize, &cbt));

	cursor = &cbt->iface;
	*cursor = iface;
	cursor->session = &session->iface;
	cursor->internal_uri = btree->dhandle->name;
	cursor->key_format = btree->key_format;
	cursor->value_format = btree->value_format;
	cbt->btree = btree;

	if (bulk) {
		F_SET(cursor, WT_CURSTD_BULK);

		cbulk = (WT_CURSOR_BULK *)cbt;

		/* Optionally skip the validation of each bulk-loaded key. */
		WT_ERR(__wt_config_gets_def(
		    session, cfg, "skip_sort_check", 0, &cval));
		WT_ERR(__wt_curbulk_init(
		    session, cbulk, bitmap, cval.val == 0 ? 0 : 1));
	}

	/*
	 * random_retrieval
	 * Random retrieval cursors only support next, reset and close.
	 */
	WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval));
	if (cval.val != 0) {
		__wt_cursor_set_notsup(cursor);
		cursor->next = __curfile_next_random;
		cursor->reset = __curfile_reset;
	}

	/* Underlying btree initialization. */
	__wt_btcur_open(cbt);

	/* __wt_cursor_init is last so we don't have to clean up on error. */
	WT_ERR(__wt_cursor_init(
	    cursor, cursor->internal_uri, owner, cfg, cursorp));

	WT_STAT_FAST_CONN_INCR(session, cursor_create);
	WT_STAT_FAST_DATA_INCR(session, cursor_create);

	if (0) {
err:		__wt_free(session, cbt);
	}

	return (ret);
}
Example #5
/*
 * __sync_file --
 *	Flush pages for a specific file.
 */
static int
__sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
{
	WT_BTREE *btree;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	WT_REF *prev, *walk;
	WT_TXN *txn;
	uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages;
	uint64_t oldest_id, saved_pinned_id, time_start, time_stop;
	uint32_t flags;
	bool timer, tried_eviction;

	conn = S2C(session);
	btree = S2BT(session);
	prev = walk = NULL;
	txn = &session->txn;
	tried_eviction = false;
	time_start = time_stop = 0;

	/* Only visit pages in cache and don't bump page read generations. */
	flags = WT_READ_CACHE | WT_READ_NO_GEN;

	/*
	 * Skip all deleted pages.  For a page to be marked deleted, it must
	 * have been evicted from cache and marked clean.  Checkpoint should
	 * never instantiate deleted pages: if a truncate is not visible to the
	 * checkpoint, the on-disk version is correct.  If the truncate is
	 * visible, we skip over the child page when writing its parent.  We
	 * check whether a truncate is visible in the checkpoint as part of
	 * reconciling internal pages (specifically in __rec_child_modify).
	 */
	LF_SET(WT_READ_DELETED_SKIP);

	internal_bytes = leaf_bytes = 0;
	internal_pages = leaf_pages = 0;
	saved_pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id;
	timer = WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT);
	if (timer)
		time_start = __wt_clock(session);

	switch (syncop) {
	case WT_SYNC_WRITE_LEAVES:
		/*
		 * Write all immediately available, dirty in-cache leaf pages.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock; serialize so multiple threads don't walk the tree at
		 * the same time.
		 */
		if (!btree->modified)
			return (0);
		__wt_spin_lock(session, &btree->flush_lock);
		if (!btree->modified) {
			__wt_spin_unlock(session, &btree->flush_lock);
			return (0);
		}

		/*
		 * Save the oldest transaction ID we need to keep around.
		 * Otherwise, in a busy system, we could be updating pages so
		 * fast that write leaves never catches up.  We deliberately
		 * have no transaction running at this point that would keep
		 * the oldest ID from moving forwards as we walk the tree.
		 */
		oldest_id = __wt_txn_oldest_id(session);

		LF_SET(WT_READ_NO_WAIT | WT_READ_SKIP_INTL);
		for (;;) {
			WT_ERR(__wt_tree_walk(session, &walk, flags));
			if (walk == NULL)
				break;

			/*
			 * Write dirty pages if nobody beat us to it.  Don't
			 * try to write hot pages (defined as pages that have
			 * been updated since the write phase leaves started):
			 * checkpoint will have to visit them anyway.
			 */
			page = walk->page;
			if (__wt_page_is_modified(page) &&
			    WT_TXNID_LT(page->modify->update_txn, oldest_id)) {
				if (txn->isolation == WT_ISO_READ_COMMITTED)
					__wt_txn_get_snapshot(session);
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
				WT_ERR(__wt_reconcile(session,
				    walk, NULL, WT_REC_CHECKPOINT, NULL));
			}
		}
		break;
	case WT_SYNC_CHECKPOINT:
		/*
		 * If we are flushing a file at read-committed isolation, which
		 * is of particular interest for flushing the metadata to make
		 * a schema-changing operation durable, get a transactional
		 * snapshot now.
		 *
		 * All changes committed up to this point should be included.
		 * We don't update the snapshot in between pages because the
		 * metadata shouldn't have many pages.  Instead, read-committed
		 * isolation ensures that all metadata updates completed before
		 * the checkpoint are included.
		 */
		if (txn->isolation == WT_ISO_READ_COMMITTED)
			__wt_txn_get_snapshot(session);

		/*
		 * We cannot check the tree modified flag in the case of a
		 * checkpoint, the checkpoint code has already cleared it.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock; serialize so multiple threads don't walk the tree at
		 * the same time.  We're holding the schema lock, but need the
		 * lower-level lock as well.
		 */
		__wt_spin_lock(session, &btree->flush_lock);

		/*
		 * In the final checkpoint pass, child pages cannot be evicted
		 * from underneath internal pages nor can underlying blocks be
		 * freed until the checkpoint's block lists are stable. Also,
		 * we cannot split child pages into parents unless we know the
		 * final pass will write a consistent view of that namespace.
		 * Set the checkpointing flag to block such actions and wait for
		 * any problematic eviction or page splits to complete.
		 */
		WT_ASSERT(session, btree->syncing == WT_BTREE_SYNC_OFF &&
		    btree->sync_session == NULL);

		btree->sync_session = session;
		btree->syncing = WT_BTREE_SYNC_WAIT;
		(void)__wt_gen_next_drain(session, WT_GEN_EVICT);
		btree->syncing = WT_BTREE_SYNC_RUNNING;

		/* Write all dirty in-cache pages. */
		LF_SET(WT_READ_NO_EVICT);

		/* Read pages with lookaside entries and evict them asap. */
		LF_SET(WT_READ_LOOKASIDE | WT_READ_WONT_NEED);

		for (;;) {
			WT_ERR(__sync_dup_walk(session, walk, flags, &prev));
			WT_ERR(__wt_tree_walk(session, &walk, flags));

			if (walk == NULL)
				break;

			/*
			 * Skip clean pages, but need to make sure maximum
			 * transaction ID is always updated.
			 */
			if (!__wt_page_is_modified(walk->page)) {
				if (((mod = walk->page->modify) != NULL) &&
				    mod->rec_max_txn > btree->rec_max_txn)
					btree->rec_max_txn = mod->rec_max_txn;
#ifdef HAVE_TIMESTAMPS
				if (mod != NULL && __wt_timestamp_cmp(
				    &btree->rec_max_timestamp,
				    &mod->rec_max_timestamp) < 0)
					__wt_timestamp_set(
					    &btree->rec_max_timestamp,
					    &mod->rec_max_timestamp);
#endif
				continue;
			}

			/*
			 * Take a local reference to the page modify structure
			 * now that we know the page is dirty. It needs to be
			 * done in this order otherwise the page modify
			 * structure could have been created between taking the
			 * reference and checking modified.
			 */
			page = walk->page;

			/*
			 * Write dirty pages, if we can't skip them. If we skip
			 * a page, mark the tree dirty. The checkpoint marked it
			 * clean and we can't skip future checkpoints until this
			 * page is written.
			 */
			if (__sync_checkpoint_can_skip(session, page)) {
				__wt_tree_modify_set(session);
				continue;
			}

			if (WT_PAGE_IS_INTERNAL(page)) {
				internal_bytes += page->memory_footprint;
				++internal_pages;
			} else {
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
			}

			/*
			 * If the page was pulled into cache by our read, try
			 * to evict it now.
			 *
			 * For eviction to have a chance, we first need to move
			 * the walk point to the next page checkpoint will
			 * visit.  We want to avoid this code being too special
			 * purpose, so try to reuse the ordinary eviction path.
			 *
			 * Regardless of whether eviction succeeds or fails,
			 * the walk continues from the previous location.  We
			 * remember whether we tried eviction, and don't try
			 * again.  Even if eviction fails (the page may stay in
			 * cache clean but with history that cannot be
			 * discarded), that is not wasted effort because
			 * checkpoint doesn't need to write the page again.
			 */
			if (!WT_PAGE_IS_INTERNAL(page) &&
			    page->read_gen == WT_READGEN_WONT_NEED &&
			    !tried_eviction) {
				WT_ERR_BUSY_OK(
				    __wt_page_release_evict(session, walk));
				walk = prev;
				prev = NULL;
				tried_eviction = true;
				continue;
			}
			tried_eviction = false;

			WT_ERR(__wt_reconcile(
			    session, walk, NULL, WT_REC_CHECKPOINT, NULL));

			/*
			 * Update checkpoint IO tracking data if configured
			 * to log verbose progress messages.
			 */
			if (conn->ckpt_timer_start.tv_sec > 0) {
				conn->ckpt_write_bytes +=
				    page->memory_footprint;
				++conn->ckpt_write_pages;

				/* Periodically log checkpoint progress. */
				if (conn->ckpt_write_pages % 5000 == 0)
					__wt_checkpoint_progress(
					    session, false);
			}
		}
		break;
	case WT_SYNC_CLOSE:
	case WT_SYNC_DISCARD:
		WT_ERR(__wt_illegal_value(session, syncop));
		break;
	}

	if (timer) {
		time_stop = __wt_clock(session);
		__wt_verbose(session, WT_VERB_CHECKPOINT,
		    "__sync_file WT_SYNC_%s wrote: %" PRIu64
		    " leaf pages (%" PRIu64 "B), %" PRIu64
		    " internal pages (%" PRIu64 "B), and took %" PRIu64 "ms",
		    syncop == WT_SYNC_WRITE_LEAVES ?
		    "WRITE_LEAVES" : "CHECKPOINT",
		    leaf_pages, leaf_bytes, internal_pages, internal_bytes,
		    WT_CLOCKDIFF_MS(time_stop, time_start));
	}

err:	/* On error, clear any left-over tree walk. */
	WT_TRET(__wt_page_release(session, walk, flags));
	WT_TRET(__wt_page_release(session, prev, flags));

	/*
	 * If we got a snapshot in order to write pages, and there was no
	 * snapshot active when we started, release it.
	 */
	if (txn->isolation == WT_ISO_READ_COMMITTED &&
	    saved_pinned_id == WT_TXN_NONE)
		__wt_txn_release_snapshot(session);

	/* Clear the checkpoint flag. */
	btree->syncing = WT_BTREE_SYNC_OFF;
	btree->sync_session = NULL;

	__wt_spin_unlock(session, &btree->flush_lock);

	/*
	 * Leaves are written before a checkpoint (or as part of a file close,
	 * before checkpointing the file).  Start a flush to stable storage,
	 * but don't wait for it.
	 */
	if (ret == 0 &&
	    syncop == WT_SYNC_WRITE_LEAVES && F_ISSET(conn, WT_CONN_CKPT_SYNC))
		WT_RET(btree->bm->sync(btree->bm, session, false));

	return (ret);
}
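For context, __sync_file is internal; from an application's point of view the checkpoint pass is normally reached through the public WT_SESSION API. A minimal sketch under that assumption (the home directory and configuration are placeholders):

#include <wiredtiger.h>

/*
 * Hypothetical application-level call: a checkpoint driven through the
 * public API eventually reaches the per-file pass in __sync_file above.
 */
static int
checkpoint_example(void)
{
	WT_CONNECTION *conn;
	WT_SESSION *session;
	int ret, tret;

	if ((ret = wiredtiger_open("WT_HOME", NULL, "create", &conn)) != 0)
		return (ret);
	if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) {
		(void)conn->close(conn, NULL);
		return (ret);
	}
	ret = session->checkpoint(session, NULL);	/* Flush dirty pages. */
	if ((tret = conn->close(conn, NULL)) != 0 && ret == 0)
		ret = tret;
	return (ret);
}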
Example #6
/*
 * __wt_schema_project_out --
 *	Given a list of cursors and a projection, read columns from the
 *	dependent cursors and return them to the application.
 */
int
__wt_schema_project_out(WT_SESSION_IMPL *session,
    WT_CURSOR **cp, const char *proj_arg, va_list ap)
{
	WT_CURSOR *c;
	WT_DECL_PACK(pack);
	WT_DECL_PACK_VALUE(pv);
	u_long arg;
	char *proj;
	uint8_t *p, *end;

	p = end = NULL;		/* -Wuninitialized */

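	/*
	 * A projection plan is a sequence of entries, each an optional
	 * decimal argument followed by one operation character: WT_PROJ_KEY
	 * and WT_PROJ_VALUE select a dependent cursor (the argument is its
	 * index in cp) and position at its key or value buffer, then
	 * WT_PROJ_NEXT, WT_PROJ_SKIP and WT_PROJ_REUSE step through that
	 * buffer's packed columns, the argument being a repeat count where
	 * a missing count means one.
	 */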
	for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
		arg = strtoul(proj, &proj, 10);

		switch (*proj) {
		case WT_PROJ_KEY:
			c = cp[arg];
			if (WT_CURSOR_RECNO(c)) {
				c->key.data = &c->recno;
				c->key.size = sizeof(c->recno);
				WT_RET(__pack_init(session, &pack, "R"));
			} else
				WT_RET(__pack_init(
				    session, &pack, c->key_format));
			p = (uint8_t *)c->key.data;
			end = p + c->key.size;
			continue;

		case WT_PROJ_VALUE:
			c = cp[arg];
			WT_RET(__pack_init(session, &pack, c->value_format));
			p = (uint8_t *)c->value.data;
			end = p + c->value.size;
			continue;
		}

		/*
		 * Otherwise, the argument is a count, where a missing
		 * count means a count of 1.
		 */
		for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) {
			switch (*proj) {
			case WT_PROJ_NEXT:
			case WT_PROJ_SKIP:
			case WT_PROJ_REUSE:
				WT_RET(__pack_next(&pack, &pv));
				WT_RET(__unpack_read(session, &pv,
				    (const uint8_t **)&p, (size_t)(end - p)));
				/* Only copy the value out once. */
				if (*proj != WT_PROJ_NEXT)
					break;
				WT_UNPACK_PUT(session, pv, ap);
				break;
			}
		}
	}

	return (0);
}
Example #7
/*
 * __wt_schema_project_merge --
 *	Given a list of cursors and a projection, build a buffer containing the
 *	column values read from the cursors.
 */
int
__wt_schema_project_merge(WT_SESSION_IMPL *session,
    WT_CURSOR **cp, const char *proj_arg, const char *vformat, WT_ITEM *value)
{
	WT_CURSOR *c;
	WT_ITEM *buf;
	WT_DECL_PACK(pack);
	WT_DECL_PACK_VALUE(pv);
	WT_DECL_PACK_VALUE(vpv);
	WT_PACK vpack;
	u_long arg;
	char *proj;
	const uint8_t *p, *end;
	uint8_t *vp;
	size_t len;

	p = end = NULL;		/* -Wuninitialized */

	WT_RET(__wt_buf_init(session, value, 0));
	WT_RET(__pack_init(session, &vpack, vformat));

	for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
		arg = strtoul(proj, &proj, 10);

		switch (*proj) {
		case WT_PROJ_KEY:
			c = cp[arg];
			if (WT_CURSOR_RECNO(c)) {
				c->key.data = &c->recno;
				c->key.size = sizeof(c->recno);
				WT_RET(__pack_init(session, &pack, "R"));
			} else
				WT_RET(__pack_init(
				    session, &pack, c->key_format));
			buf = &c->key;
			p = buf->data;
			end = p + buf->size;
			continue;

		case WT_PROJ_VALUE:
			c = cp[arg];
			WT_RET(__pack_init(session, &pack, c->value_format));
			buf = &c->value;
			p = buf->data;
			end = p + buf->size;
			continue;
		}

		/*
		 * Otherwise, the argument is a count, where a missing
		 * count means a count of 1.
		 */
		for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) {
			switch (*proj) {
			case WT_PROJ_NEXT:
			case WT_PROJ_SKIP:
			case WT_PROJ_REUSE:
				WT_RET(__pack_next(&pack, &pv));
				WT_RET(__unpack_read(session, &pv,
				    &p, (size_t)(end - p)));
				/* Only copy the value out once. */
				if (*proj != WT_PROJ_NEXT)
					break;

				WT_RET(__pack_next(&vpack, &vpv));
				/* Make sure the types are compatible. */
				WT_ASSERT(session,
				    __wt_tolower((u_char)pv.type) ==
				    __wt_tolower((u_char)vpv.type));
				vpv.u = pv.u;
				len = __pack_size(session, &vpv);
				WT_RET(__wt_buf_grow(session,
				    value, value->size + len));
				vp = (uint8_t *)value->mem + value->size;
				WT_RET(__pack_write(session, &vpv, &vp, len));
				value->size += len;
				break;
			}
		}
	}

	return (0);
}
Example #8
/*
 * __curlog_op_read --
 *	Read out any key/value from an individual operation record
 *	in the log.  We're only interested in put and remove operations
 *	since truncate is not a cursor operation.  All successful
 *	returns from this function will have set up the cursor copy of
 *	key and value to give the user.
 */
static int
__curlog_op_read(WT_SESSION_IMPL *session,
    WT_CURSOR_LOG *cl, uint32_t optype, uint32_t opsize, uint32_t *fileid)
{
	WT_ITEM key, value;
	uint64_t recno;
	const uint8_t *end, *pp;

	pp = cl->stepp;
	end = pp + opsize;
	switch (optype) {
	case WT_LOGOP_COL_MODIFY:
		WT_RET(__wt_logop_col_modify_unpack(session, &pp, end,
		    fileid, &recno, &value));
		WT_RET(__wt_buf_set(session, cl->opkey, &recno, sizeof(recno)));
		WT_RET(__wt_buf_set(session,
		    cl->opvalue, value.data, value.size));
		break;
	case WT_LOGOP_COL_PUT:
		WT_RET(__wt_logop_col_put_unpack(session, &pp, end,
		    fileid, &recno, &value));
		WT_RET(__wt_buf_set(session, cl->opkey, &recno, sizeof(recno)));
		WT_RET(__wt_buf_set(session,
		    cl->opvalue, value.data, value.size));
		break;
	case WT_LOGOP_COL_REMOVE:
		WT_RET(__wt_logop_col_remove_unpack(session, &pp, end,
		    fileid, &recno));
		WT_RET(__wt_buf_set(session, cl->opkey, &recno, sizeof(recno)));
		WT_RET(__wt_buf_set(session, cl->opvalue, NULL, 0));
		break;
	case WT_LOGOP_ROW_MODIFY:
		WT_RET(__wt_logop_row_modify_unpack(session, &pp, end,
		    fileid, &key, &value));
		WT_RET(__wt_buf_set(session, cl->opkey, key.data, key.size));
		WT_RET(__wt_buf_set(session,
		    cl->opvalue, value.data, value.size));
		break;
	case WT_LOGOP_ROW_PUT:
		WT_RET(__wt_logop_row_put_unpack(session, &pp, end,
		    fileid, &key, &value));
		WT_RET(__wt_buf_set(session, cl->opkey, key.data, key.size));
		WT_RET(__wt_buf_set(session,
		    cl->opvalue, value.data, value.size));
		break;
	case WT_LOGOP_ROW_REMOVE:
		WT_RET(__wt_logop_row_remove_unpack(session, &pp, end,
		    fileid, &key));
		WT_RET(__wt_buf_set(session, cl->opkey, key.data, key.size));
		WT_RET(__wt_buf_set(session, cl->opvalue, NULL, 0));
		break;
	default:
		/*
		 * Any other operations return the record in the value
		 * and an empty key.
		 */
		*fileid = 0;
		WT_RET(__wt_buf_set(session, cl->opkey, NULL, 0));
		WT_RET(__wt_buf_set(session, cl->opvalue, cl->stepp, opsize));
	}
	return (0);
}
Example #9
/*
 * __wt_page_in_func --
 *	Acquire a hazard pointer to a page; if the page is not in-memory,
 *	read it from the disk and build an in-memory version.
 */
int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE *page;
	u_int sleep_cnt, wait_cnt;
	bool busy, cache_work, oldgen, stalled;
	int force_attempts;

	btree = S2BT(session);

	for (oldgen = stalled = false,
	    force_attempts = 0, sleep_cnt = wait_cnt = 0;;) {
		switch (ref->state) {
		case WT_REF_DELETED:
			if (LF_ISSET(WT_READ_NO_EMPTY) &&
			    __wt_delete_page_skip(session, ref, false))
				return (WT_NOTFOUND);
			/* FALLTHROUGH */
		case WT_REF_DISK:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);

			/*
			 * The page isn't in memory, read it. If this thread is
			 * allowed to do eviction work, check for space in the
			 * cache.
			 */
			if (!LF_ISSET(WT_READ_NO_EVICT))
				WT_RET(__wt_cache_eviction_check(
				    session, 1, NULL));
			WT_RET(__page_read(session, ref));
			oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
			    F_ISSET(session, WT_SESSION_NO_CACHE);
			continue;
		case WT_REF_READING:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);

			/* Waiting on another thread's read, stall. */
			WT_STAT_FAST_CONN_INCR(session, page_read_blocked);
			stalled = true;
			break;
		case WT_REF_LOCKED:
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);

			/* Waiting on eviction, stall. */
			WT_STAT_FAST_CONN_INCR(session, page_locked_blocked);
			stalled = true;
			break;
		case WT_REF_SPLIT:
			return (WT_RESTART);
		case WT_REF_MEM:
			/*
			 * The page is in memory.
			 *
			 * Get a hazard pointer if one is required. If no
			 * hazard pointer is required, the page can't be
			 * evicted out from under us and we're done.
			 */
			if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
				goto skip_evict;

			/*
			 * The expected reason we can't get a hazard pointer is
			 * because the page is being evicted, yield, try again.
			 */
#ifdef HAVE_DIAGNOSTIC
			WT_RET(
			    __wt_hazard_set(session, ref, &busy, file, line));
#else
			WT_RET(__wt_hazard_set(session, ref, &busy));
#endif
			if (busy) {
				WT_STAT_FAST_CONN_INCR(
				    session, page_busy_blocked);
				break;
			}

			/*
			 * If eviction is configured for this file, check to see
			 * if the page qualifies for forced eviction and update
			 * the page's generation number. If eviction isn't being
			 * done on this file, we're done.
			 */
			if (LF_ISSET(WT_READ_NO_EVICT) ||
			    F_ISSET(session, WT_SESSION_NO_EVICTION) ||
			    F_ISSET(btree, WT_BTREE_NO_EVICTION))
				goto skip_evict;

			/*
			 * Forcibly evict pages that are too big.
			 */
			if (force_attempts < 10 &&
			    __evict_force_check(session, ref)) {
				++force_attempts;
				ret = __wt_page_release_evict(session, ref);
				/* If forced eviction fails, stall. */
				if (ret == EBUSY) {
					ret = 0;
					WT_STAT_FAST_CONN_INCR(session,
					    page_forcible_evict_blocked);
					stalled = true;
					break;
				}
				WT_RET(ret);

				/*
				 * The result of a successful forced eviction
				 * is a page-state transition (potentially to
				 * an in-memory page we can use, or a restart
				 * return for our caller), continue the outer
				 * page-acquisition loop.
				 */
				continue;
			}

			/*
			 * If we read the page and we are configured to not
			 * trash the cache, set the oldest read generation so
			 * the page is forcibly evicted as soon as possible.
			 *
			 * Otherwise, update the page's read generation.
			 */
			page = ref->page;
			if (oldgen && page->read_gen == WT_READGEN_NOTSET)
				__wt_page_evict_soon(page);
			else if (!LF_ISSET(WT_READ_NO_GEN) &&
			    page->read_gen != WT_READGEN_OLDEST &&
			    page->read_gen < __wt_cache_read_gen(session))
				page->read_gen =
				    __wt_cache_read_gen_bump(session);
skip_evict:
			/*
			 * Check if we need an autocommit transaction.
			 * Starting a transaction can trigger eviction, so skip
			 * it if eviction isn't permitted.
			 */
			return (LF_ISSET(WT_READ_NO_EVICT) ? 0 :
			    __wt_txn_autocommit_check(session));
		WT_ILLEGAL_VALUE(session);
		}

		/*
		 * We failed to get the page -- yield before retrying, and if
		 * we've yielded enough times, start sleeping so we don't burn
		 * CPU to no purpose.
		 */
		if (stalled)
			wait_cnt += 1000;
		else if (++wait_cnt < 1000) {
			__wt_yield();
			continue;
		}

		/*
		 * If stalling and this thread is allowed to do eviction work,
		 * check if the cache needs help. If we do work for the cache,
		 * substitute that for a sleep.
		 */
		if (!LF_ISSET(WT_READ_NO_EVICT)) {
			WT_RET(
			    __wt_cache_eviction_check(session, 1, &cache_work));
			if (cache_work)
				continue;
		}
		sleep_cnt = WT_MIN(sleep_cnt + 1000, 10000);
		WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt);
		__wt_sleep(0, sleep_cnt);
	}
}
Example #10
/*
 * __wt_curfile_open --
 *	WT_SESSION->open_cursor method for the btree cursor type.
 */
int
__wt_curfile_open(WT_SESSION_IMPL *session, const char *uri,
    WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
{
	WT_CONFIG_ITEM cval;
	WT_DECL_RET;
	uint32_t flags;
	bool bitmap, bulk, checkpoint_wait;

	bitmap = bulk = false;
	checkpoint_wait = true;
	flags = 0;

	/*
	 * Decode the bulk configuration settings. In memory databases
	 * ignore bulk load.
	 */
	if (!F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) {
		WT_RET(__wt_config_gets_def(session, cfg, "bulk", 0, &cval));
		if (cval.type == WT_CONFIG_ITEM_BOOL ||
		    (cval.type == WT_CONFIG_ITEM_NUM &&
		    (cval.val == 0 || cval.val == 1))) {
			bitmap = false;
			bulk = cval.val != 0;
		} else if (WT_STRING_MATCH("bitmap", cval.str, cval.len))
			bitmap = bulk = true;
			/*
			 * Unordered bulk insert is a special case used
			 * internally by index creation on existing tables. It
			 * doesn't enforce any special semantics at the file
			 * level. It primarily exists to avoid some locking
			 * problems between LSM and index creation.
			 */
		else if (!WT_STRING_MATCH("unordered", cval.str, cval.len))
			WT_RET_MSG(session, EINVAL,
			    "Value for 'bulk' must be a boolean or 'bitmap'");

		if (bulk) {
			WT_RET(__wt_config_gets(session,
			    cfg, "checkpoint_wait", &cval));
			checkpoint_wait = cval.val != 0;
		}
	}

	/* Bulk handles require exclusive access. */
	if (bulk)
		LF_SET(WT_BTREE_BULK | WT_DHANDLE_EXCLUSIVE);

	/* Get the handle and lock it while the cursor is using it. */
	if (WT_PREFIX_MATCH(uri, "file:")) {
		/*
		 * If we are opening exclusive and don't want a bulk cursor
		 * open to fail with EBUSY due to a database-wide checkpoint,
		 * get the handle while holding the checkpoint lock.
		 */
		if (LF_ISSET(WT_DHANDLE_EXCLUSIVE) && checkpoint_wait)
			WT_WITH_CHECKPOINT_LOCK(session,
			    ret = __wt_session_get_btree_ckpt(
			    session, uri, cfg, flags));
		else
			ret = __wt_session_get_btree_ckpt(
			    session, uri, cfg, flags);
		WT_RET(ret);
	} else
		WT_RET(__wt_bad_object_type(session, uri));

	WT_ERR(__curfile_create(session, owner, cfg, bulk, bitmap, cursorp));

	return (0);

err:	/* If the cursor could not be opened, release the handle. */
	WT_TRET(__wt_session_release_btree(session));
	return (ret);
}
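The bulk configuration decoded above comes directly from the application's open_cursor call. A hypothetical caller requesting a bitmap bulk load (the URI is a placeholder; "bulk=bitmap" applies to fixed-length column stores, plain "bulk=true" requests an ordinary bulk cursor):

#include <wiredtiger.h>

/* Hypothetical caller: the "bulk" setting is what the parsing above decodes. */
static int
open_bulk_cursor(WT_SESSION *session, WT_CURSOR **cursorp)
{
	return (session->open_cursor(
	    session, "file:example.wt", NULL, "bulk=bitmap", cursorp));
}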
Example #11
/*
 * __wt_curlog_open --
 *	Initialize a log cursor.
 */
int
__wt_curlog_open(WT_SESSION_IMPL *session,
    const char *uri, const char *cfg[], WT_CURSOR **cursorp)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,		/* get-key */
	    __wt_cursor_get_value,		/* get-value */
	    __wt_cursor_set_key,		/* set-key */
	    __wt_cursor_set_value,		/* set-value */
	    __curlog_compare,			/* compare */
	    __wt_cursor_equals,			/* equals */
	    __curlog_next,			/* next */
	    __wt_cursor_notsup,			/* prev */
	    __curlog_reset,			/* reset */
	    __curlog_search,			/* search */
	    __wt_cursor_search_near_notsup,	/* search-near */
	    __wt_cursor_notsup,			/* insert */
	    __wt_cursor_modify_notsup,		/* modify */
	    __wt_cursor_notsup,			/* update */
	    __wt_cursor_notsup,			/* remove */
	    __wt_cursor_notsup,			/* reserve */
	    __wt_cursor_reconfigure_notsup,	/* reconfigure */
	    __wt_cursor_notsup,			/* cache */
	    __wt_cursor_reopen_notsup,		/* reopen */
	    __curlog_close);			/* close */
	WT_CURSOR *cursor;
	WT_CURSOR_LOG *cl;
	WT_DECL_RET;
	WT_LOG *log;

	WT_STATIC_ASSERT(offsetof(WT_CURSOR_LOG, iface) == 0);

	conn = S2C(session);
	log = conn->log;

	WT_RET(__wt_calloc_one(session, &cl));
	cursor = (WT_CURSOR *)cl;
	*cursor = iface;
	cursor->session = (WT_SESSION *)session;
	cursor->key_format = WT_LOGC_KEY_FORMAT;
	cursor->value_format = WT_LOGC_VALUE_FORMAT;

	WT_ERR(__wt_calloc_one(session, &cl->cur_lsn));
	WT_ERR(__wt_calloc_one(session, &cl->next_lsn));
	WT_ERR(__wt_scr_alloc(session, 0, &cl->logrec));
	WT_ERR(__wt_scr_alloc(session, 0, &cl->opkey));
	WT_ERR(__wt_scr_alloc(session, 0, &cl->opvalue));
	WT_INIT_LSN(cl->cur_lsn);
	WT_INIT_LSN(cl->next_lsn);

	WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));

	if (log != NULL) {
		/*
		 * The user may be trying to read a log record they just wrote.
		 * Log records may be buffered, so force out any now.
		 */
		WT_ERR(__wt_log_force_write(session, 1, NULL));

		/* Log cursors block archiving. */
		__wt_readlock(session, &log->log_archive_lock);
		F_SET(cl, WT_CURLOG_ARCHIVE_LOCK);
		(void)__wt_atomic_add32(&conn->log_cursors, 1);

	}

	if (0) {
err:		WT_TRET(__curlog_close(cursor));
		*cursorp = NULL;
	}

	return (ret);
}
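A log cursor opened through this path is read-only. A hypothetical application-side loop over it (key and value layouts follow the WT_LOGC_KEY_FORMAT and WT_LOGC_VALUE_FORMAT strings set above):

#include <wiredtiger.h>

/* Hypothetical application use: open "log:" and step through the records. */
static int
scan_log(WT_SESSION *session)
{
	WT_CURSOR *logc;
	int ret, tret;

	if ((ret = session->open_cursor(
	    session, "log:", NULL, NULL, &logc)) != 0)
		return (ret);
	while ((ret = logc->next(logc)) == 0)
		;	/* get_key()/get_value() would decode each record. */
	if (ret == WT_NOTFOUND)	/* Normal end of the log. */
		ret = 0;
	if ((tret = logc->close(logc)) != 0 && ret == 0)
		ret = tret;
	return (ret);
}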
Example #12
/*
 * __curfile_create --
 *	Open a cursor for a given btree handle.
 */
static int
__curfile_create(WT_SESSION_IMPL *session,
    WT_CURSOR *owner, const char *cfg[], bool bulk, bool bitmap,
    WT_CURSOR **cursorp)
{
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,		/* get-key */
	    __wt_cursor_get_value,		/* get-value */
	    __wt_cursor_set_key,		/* set-key */
	    __wt_cursor_set_value,		/* set-value */
	    __curfile_compare,			/* compare */
	    __curfile_equals,			/* equals */
	    __curfile_next,			/* next */
	    __curfile_prev,			/* prev */
	    __curfile_reset,			/* reset */
	    __curfile_search,			/* search */
	    __curfile_search_near,		/* search-near */
	    __curfile_insert,			/* insert */
	    __wt_cursor_modify_notsup,		/* modify */
	    __curfile_update,			/* update */
	    __curfile_remove,			/* remove */
	    __curfile_reserve,			/* reserve */
	    __wt_cursor_reconfigure,		/* reconfigure */
	    __curfile_close);			/* close */
	WT_BTREE *btree;
	WT_CONFIG_ITEM cval;
	WT_CURSOR *cursor;
	WT_CURSOR_BTREE *cbt;
	WT_CURSOR_BULK *cbulk;
	WT_DECL_RET;
	size_t csize;

	WT_STATIC_ASSERT(offsetof(WT_CURSOR_BTREE, iface) == 0);

	cbt = NULL;

	btree = S2BT(session);
	WT_ASSERT(session, btree != NULL);

	csize = bulk ? sizeof(WT_CURSOR_BULK) : sizeof(WT_CURSOR_BTREE);
	WT_RET(__wt_calloc(session, 1, csize, &cbt));

	cursor = &cbt->iface;
	*cursor = iface;
	cursor->session = &session->iface;
	cursor->internal_uri = btree->dhandle->name;
	cursor->key_format = btree->key_format;
	cursor->value_format = btree->value_format;
	cbt->btree = btree;

	/*
	 * Increment the data-source's in-use counter; done now because closing
	 * the cursor will decrement it, and all failure paths from here close
	 * the cursor.
	 */
	__wt_cursor_dhandle_incr_use(session);

	if (session->dhandle->checkpoint != NULL)
		F_SET(cbt, WT_CBT_NO_TXN);

	if (bulk) {
		F_SET(cursor, WT_CURSTD_BULK);

		cbulk = (WT_CURSOR_BULK *)cbt;

		/* Optionally skip the validation of each bulk-loaded key. */
		WT_ERR(__wt_config_gets_def(
		    session, cfg, "skip_sort_check", 0, &cval));
		WT_ERR(__wt_curbulk_init(
		    session, cbulk, bitmap, cval.val == 0 ? 0 : 1));
	}

	/*
	 * Random retrieval, row-store only.
	 * Random retrieval cursors support a limited set of methods.
	 */
	WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval));
	if (cval.val != 0) {
		if (WT_CURSOR_RECNO(cursor))
			WT_ERR_MSG(session, ENOTSUP,
			    "next_random configuration not supported for "
			    "column-store objects");

		__wt_cursor_set_notsup(cursor);
		cursor->next = __wt_curfile_next_random;
		cursor->reset = __curfile_reset;

		WT_ERR(__wt_config_gets_def(
		    session, cfg, "next_random_sample_size", 0, &cval));
		if (cval.val != 0)
			cbt->next_random_sample_size = (u_int)cval.val;
	}

	/* Underlying btree initialization. */
	__wt_btcur_open(cbt);

	/*
	 * WT_CURSOR.modify supported on 'u' value formats, but the fast-path
	 * through the btree code requires log file format changes, it's not
	 * available in all versions.
	 */
	if (WT_STREQ(cursor->value_format, "u") &&
	    S2C(session)->compat_major >= WT_LOG_V2)
		cursor->modify = __curfile_modify;

	WT_ERR(__wt_cursor_init(
	    cursor, cursor->internal_uri, owner, cfg, cursorp));

	WT_STAT_CONN_INCR(session, cursor_create);
	WT_STAT_DATA_INCR(session, cursor_create);

	if (0) {
err:		/*
		 * Our caller expects to release the data handle if we fail.
		 * Disconnect it from the cursor before closing.
		 */
		if (session->dhandle != NULL)
			__wt_cursor_dhandle_decr_use(session);
		cbt->btree = NULL;
		WT_TRET(__curfile_close(cursor));
		*cursorp = NULL;
	}

	return (ret);
}
Example #13
/*
 * __wt_txn_commit --
 *	Commit the current transaction.
 */
int
__wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_TXN *txn;
	WT_TXN_OP *op;
	u_int i;

	txn = &session->txn;
	conn = S2C(session);
	WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR));

	if (!F_ISSET(txn, WT_TXN_RUNNING))
		WT_RET_MSG(session, EINVAL, "No transaction is active");

	/*
	 * The default sync setting is inherited from the connection, but can
	 * be overridden by an explicit "sync" setting for this transaction.
	 */
	WT_RET(__wt_config_gets_def(session, cfg, "sync", 0, &cval));

	/*
	 * If the user chose the default setting, check whether sync is enabled
	 * for this transaction (either inherited or via begin_transaction).
	 * If sync is disabled, clear the field to avoid the log write being
	 * flushed.
	 *
	 * Otherwise check for specific settings.  We don't need to check for
	 * "on" because that is the default inherited from the connection.  If
	 * the user set anything in begin_transaction, we only override with an
	 * explicit setting.
	 */
	if (cval.len == 0) {
		if (!FLD_ISSET(txn->txn_logsync, WT_LOG_FLUSH) &&
		    !F_ISSET(txn, WT_TXN_SYNC_SET))
			txn->txn_logsync = 0;
	} else {
		/*
		 * If the caller already set sync on begin_transaction then
		 * they should not be using sync on commit_transaction.
		 * Flag that as an error.
		 */
		if (F_ISSET(txn, WT_TXN_SYNC_SET))
			WT_RET_MSG(session, EINVAL,
			    "Sync already set during begin_transaction.");
		if (WT_STRING_MATCH("background", cval.str, cval.len))
			txn->txn_logsync = WT_LOG_BACKGROUND;
		else if (WT_STRING_MATCH("off", cval.str, cval.len))
			txn->txn_logsync = 0;
		/*
		 * We don't need to check for "on" here because that is the
		 * default to inherit from the connection setting.
		 */
	}

	/* Commit notification. */
	if (txn->notify != NULL)
		WT_TRET(txn->notify->notify(txn->notify,
		    (WT_SESSION *)session, txn->id, 1));

	/* If we are logging, write a commit log record. */
	if (ret == 0 && txn->mod_count > 0 &&
	    FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
	    !F_ISSET(session, WT_SESSION_NO_LOGGING)) {
		/*
		 * We are about to block on I/O writing the log.
		 * Release our snapshot in case it is keeping data pinned.
		 * This is particularly important for checkpoints.
		 */
		__wt_txn_release_snapshot(session);
		ret = __wt_txn_log_commit(session, cfg);
	}

	/*
	 * If anything went wrong, roll back.
	 *
	 * !!!
	 * Nothing can fail after this point.
	 */
	if (ret != 0) {
		WT_TRET(__wt_txn_rollback(session, cfg));
		return (ret);
	}

	/* Free memory associated with updates. */
	for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++)
		__wt_txn_op_free(session, op);
	txn->mod_count = 0;

	/*
	 * We are about to release the snapshot: copy values into any
	 * positioned cursors so they don't point to updates that could be
	 * freed once we don't have a transaction ID pinned.
	 */
	if (session->ncursors > 0)
		WT_RET(__wt_session_copy_values(session));

	__wt_txn_release(session);
	return (0);
}
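The "sync" override parsed above is the same setting an application passes to commit_transaction. A hypothetical caller (session and cursor assumed already open; setting sync in begin_transaction instead would make the commit-time override an error, as checked above):

#include <wiredtiger.h>

/* Hypothetical transaction: commit with logging sync disabled. */
static int
insert_one(WT_SESSION *session, WT_CURSOR *cursor)
{
	int ret;

	if ((ret = session->begin_transaction(session, NULL)) != 0)
		return (ret);
	cursor->set_key(cursor, "key");
	cursor->set_value(cursor, "value");
	if ((ret = cursor->insert(cursor)) != 0) {
		(void)session->rollback_transaction(session, NULL);
		return (ret);
	}
	return (session->commit_transaction(session, "sync=off"));
}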
Example #14
/*
 * __backup_start --
 *	Start a backup.
 */
static int
__backup_start(
    WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, const char *cfg[])
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	bool exist, log_only, target_list;

	conn = S2C(session);

	cb->next = 0;
	cb->list = NULL;
	cb->list_next = 0;

	/*
	 * Single thread hot backups: we're holding the schema lock, so we
	 * know we'll serialize with other attempts to start a hot backup.
	 */
	if (conn->hot_backup)
		WT_RET_MSG(
		    session, EINVAL, "there is already a backup cursor open");

	/*
	 * The hot backup copy is done outside of WiredTiger, which means file
	 * blocks can't be freed and re-allocated until the backup completes.
	 * The checkpoint code checks the backup flag, and if a backup cursor
	 * is open checkpoints aren't discarded. We release the lock as soon
	 * as we've set the flag, we don't want to block checkpoints, we just
	 * want to make sure no checkpoints are deleted.  The checkpoint code
	 * holds the lock until it's finished the checkpoint, otherwise we
	 * could start a hot backup that would race with an already-started
	 * checkpoint.
	 */
	WT_RET(__wt_writelock(session, conn->hot_backup_lock));
	conn->hot_backup = true;
	WT_ERR(__wt_writeunlock(session, conn->hot_backup_lock));

	/* Create the hot backup file. */
	WT_ERR(__backup_file_create(session, cb, false));

	/* Add log files if logging is enabled. */

	/*
	 * If a list of targets was specified, work our way through them.
	 * Else, generate a list of all database objects.
	 *
	 * Include log files if doing a full backup, and copy them before
	 * copying data files to avoid rolling the metadata forward across
	 * a checkpoint that completes during the backup.
	 */
	target_list = false;
	WT_ERR(__backup_uri(session, cfg, &target_list, &log_only));

	if (!target_list) {
		WT_ERR(__backup_log_append(session, cb, true));
		WT_ERR(__backup_all(session));
	}

	/* Add the hot backup and standard WiredTiger files to the list. */
	if (log_only) {
		/*
		 * Close any hot backup file.
		 * We're about to open the incremental backup file.
		 */
		WT_TRET(__wt_fclose(&cb->bfp, WT_FHANDLE_WRITE));
		WT_ERR(__backup_file_create(session, cb, log_only));
		WT_ERR(__backup_list_append(
		    session, cb, WT_INCREMENTAL_BACKUP));
	} else {
		WT_ERR(__backup_list_append(session, cb, WT_METADATA_BACKUP));
		WT_ERR(__wt_exist(session, WT_BASECONFIG, &exist));
		if (exist)
			WT_ERR(__backup_list_append(
			    session, cb, WT_BASECONFIG));
		WT_ERR(__wt_exist(session, WT_USERCONFIG, &exist));
		if (exist)
			WT_ERR(__backup_list_append(
			    session, cb, WT_USERCONFIG));
		WT_ERR(__backup_list_append(session, cb, WT_WIREDTIGER));
	}

err:	/* Close the hot backup file. */
	WT_TRET(__wt_fclose(&cb->bfp, WT_FHANDLE_WRITE));
	if (ret != 0) {
		WT_TRET(__backup_cleanup_handles(session, cb));
		WT_TRET(__backup_stop(session));
	}

	return (ret);
}
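The backup started here is driven from an application opening a backup cursor. A hypothetical loop over the file list it returns (the cursor must stay open until the copies are complete so checkpoint blocks remain pinned, as the comments above describe):

#include <wiredtiger.h>

/* Hypothetical application use: list the files a hot backup must copy. */
static int
list_backup_files(WT_SESSION *session)
{
	WT_CURSOR *backup;
	const char *filename;
	int ret, tret;

	if ((ret = session->open_cursor(
	    session, "backup:", NULL, NULL, &backup)) != 0)
		return (ret);
	while ((ret = backup->next(backup)) == 0) {
		if ((ret = backup->get_key(backup, &filename)) != 0)
			break;
		/* Copy "filename" out of the database directory here. */
	}
	if (ret == WT_NOTFOUND)	/* Normal end of the list. */
		ret = 0;
	if ((tret = backup->close(backup)) != 0 && ret == 0)
		ret = tret;
	return (ret);
}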
Example #15
/*
 * __conn_dhandle_open_lock --
 *	Spin on the current data handle until either (a) it is open, read
 *	locked; or (b) it is closed, write locked.  If exclusive access is
 *	requested and cannot be granted immediately because the handle is
 *	in use, fail with EBUSY.
 *
 *	Here is a brief summary of how different operations synchronize using
 *	either the schema lock, handle locks or handle flags:
 *
 *	open -- holds the schema lock, one thread gets the handle exclusive,
 *		reverts to a shared handle lock and drops the schema lock
 *		once the handle is open;
 *	bulk load -- sets bulk and exclusive;
 *	salvage, truncate, update, verify -- hold the schema lock, set a
 *		"special" flag;
 *	sweep -- gets a write lock on the handle, doesn't set exclusive
 *
 *	The schema lock prevents a lot of potential conflicts: we should never
 *	see handles being salvaged or verified because those operation hold the
 *	schema lock.  However, it is possible to see a handle that is being
 *	bulk loaded, or that the sweep server is closing.
 *
 *	The principle here is that application operations can cause other
 *	application operations to fail (so attempting to open a cursor on a
 *	file while it is being bulk-loaded will fail), but internal or
 *	database-wide operations should not prevent application-initiated
 *	operations.  For example, attempting to verify a file should not fail
 *	because the sweep server happens to be in the process of closing that
 *	file.
 */
static int
__conn_dhandle_open_lock(
    WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle, uint32_t flags)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	int is_open, lock_busy, want_exclusive;

	btree = dhandle->handle;
	lock_busy = 0;
	want_exclusive = LF_ISSET(WT_DHANDLE_EXCLUSIVE) ? 1 : 0;

	/*
	 * Check that the handle is open.  We've already incremented
	 * the reference count, so once the handle is open it won't be
	 * closed by another thread.
	 *
	 * If we can see the WT_DHANDLE_OPEN flag set while holding a
	 * lock on the handle, then it's really open and we can start
	 * using it.  Alternatively, if we can get an exclusive lock
	 * and WT_DHANDLE_OPEN is still not set, we need to do the open.
	 */
	for (;;) {
		/*
		 * If the handle is already open for a special operation,
		 * give up.
		 */
		if (F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS))
			return (EBUSY);

		/*
		 * If the handle is open, get a read lock and recheck.
		 *
		 * Wait for a read lock if we want exclusive access and failed
		 * to get it: the sweep server may be closing this handle, and
		 * we need to wait for it to complete.  If we want exclusive
		 * access and find the handle open once we get the read lock,
		 * give up: some other thread has it locked for real.
		 */
		if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
		    (!want_exclusive || lock_busy)) {
			WT_RET(__wt_readlock(session, dhandle->rwlock));
			is_open = F_ISSET(dhandle, WT_DHANDLE_OPEN) ? 1 : 0;
			if (is_open && !want_exclusive)
				return (0);
			WT_RET(__wt_readunlock(session, dhandle->rwlock));
		} else
			is_open = 0;

		/*
		 * It isn't open or we want it exclusive: try to get an
		 * exclusive lock.  There is some subtlety here: if we race
		 * with another thread that successfully opens the file, we
		 * don't want to block waiting to get exclusive access.
		 */
		if ((ret = __wt_try_writelock(session, dhandle->rwlock)) == 0) {
			/*
			 * If it was opened while we waited, drop the write
			 * lock and get a read lock instead.
			 */
			if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
			    !want_exclusive) {
				lock_busy = 0;
				WT_RET(
				    __wt_writeunlock(session, dhandle->rwlock));
				continue;
			}

			/* We have an exclusive lock, we're done. */
			F_SET(dhandle, WT_DHANDLE_EXCLUSIVE);
			return (0);
		} else if (ret != EBUSY || (is_open && want_exclusive))
			return (ret);
		else
			lock_busy = 1;

		/* Give other threads a chance to make progress. */
		__wt_yield();
	}
}
Example #16
/*
 * __wt_col_modify --
 *	Column-store delete, insert, and update.
 */
int
__wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int op)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_INSERT *ins, *ins_copy;
	WT_INSERT_HEAD **inshead, *new_inshead, **new_inslist;
	WT_ITEM *value, _value;
	WT_PAGE *page;
	WT_UPDATE *old_upd, *upd, *upd_obsolete;
	size_t ins_size, new_inshead_size, new_inslist_size, upd_size;
	uint64_t recno;
	u_int skipdepth;
	int i, logged;

	btree = cbt->btree;
	page = cbt->page;
	recno = cbt->iface.recno;
	logged = 0;

	WT_ASSERT(session, op != 1);

	switch (op) {
	case 2:						/* Remove */
		if (btree->type == BTREE_COL_FIX) {
			value = &_value;
			value->data = "";
			value->size = 1;
		} else
			value = NULL;
		break;
	case 3:						/* Insert/Update */
	default:
		value = &cbt->iface.value;

		/*
		 * There's some chance the application specified a record past
		 * the last record on the page.  If that's the case, and we're
		 * inserting a new WT_INSERT/WT_UPDATE pair, it goes on the
		 * append list, not the update list.
		 */
		if (recno == 0 || recno > __col_last_recno(page))
			op = 1;
		break;
	}

	/* If we don't yet have a modify structure, we'll need one. */
	WT_RET(__wt_page_modify_init(session, page));

	ins = NULL;
	new_inshead = NULL;
	new_inslist = NULL;
	upd = NULL;

	/*
	 * Delete, insert or update a column-store entry.
	 *
	 * If modifying a previously modified record, create a new WT_UPDATE
	 * entry and have a serialized function link it into an existing
	 * WT_INSERT entry's WT_UPDATE list.
	 *
	 * Else, allocate an insert array as necessary, build a WT_INSERT and
	 * WT_UPDATE structure pair, and call a serialized function to insert
	 * the WT_INSERT structure.
	 */
	if (cbt->compare == 0 && cbt->ins != NULL) {
		/* Make sure the update can proceed. */
		WT_ERR(
		    __wt_update_check(session, page, old_upd = cbt->ins->upd));

		/* Allocate the WT_UPDATE structure and transaction ID. */
		WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
		WT_ERR(__wt_txn_modify(session, &upd->txnid));
		logged = 1;

		/* Serialize the update. */
		WT_ERR(__wt_update_serial(session, page,
		    cbt->write_gen, &cbt->ins->upd, old_upd,
		    NULL, 0, &upd, upd_size, &upd_obsolete));

		/* Discard any obsolete WT_UPDATE structures. */
		if (upd_obsolete != NULL)
			__wt_update_obsolete_free(session, page, upd_obsolete);
	} else {
		/* Make sure the update can proceed. */
		WT_ERR(__wt_update_check(session, page, NULL));

		/* There may be no insert list, allocate as necessary. */
		new_inshead_size = new_inslist_size = 0;
		if (op == 1) {
			if (page->modify->append == NULL) {
				new_inslist_size = 1 * sizeof(WT_INSERT_HEAD *);
				WT_ERR(
				    __wt_calloc_def(session, 1, &new_inslist));
				inshead = &new_inslist[0];
			} else
				inshead = &page->modify->append[0];
			cbt->ins_head = *inshead;
		} else if (page->type == WT_PAGE_COL_FIX) {
			if (page->modify->update == NULL) {
				new_inslist_size = 1 * sizeof(WT_INSERT_HEAD *);
				WT_ERR(
				    __wt_calloc_def(session, 1, &new_inslist));
				inshead = &new_inslist[0];
			} else
				inshead = &page->modify->update[0];
		} else {
			if (page->modify->update == NULL) {
				new_inslist_size =
				    page->entries * sizeof(WT_INSERT_HEAD *);
				WT_ERR(__wt_calloc_def(
				    session, page->entries, &new_inslist));
				inshead = &new_inslist[cbt->slot];
			} else
				inshead = &page->modify->update[cbt->slot];
		}

		/* There may be no WT_INSERT list, allocate as necessary. */
		if (*inshead == NULL) {
			new_inshead_size = sizeof(WT_INSERT_HEAD);
			WT_ERR(__wt_calloc_def(session, 1, &new_inshead));
			for (i = 0; i < WT_SKIP_MAXDEPTH; i++) {
				cbt->ins_stack[i] = &new_inshead->head[i];
				cbt->next_stack[i] = NULL;
			}
			cbt->ins_head = new_inshead;
		}

		/* Choose a skiplist depth for this insert. */
		skipdepth = __wt_skip_choose_depth();

		/*
		 * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and
		 * update the cursor to reference it.
		 */
		WT_ERR(__col_insert_alloc(
		    session, recno, skipdepth, &ins, &ins_size));
		WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
		WT_ERR(__wt_txn_modify(session, &upd->txnid));
		logged = 1;
		ins->upd = upd;
		ins_size += upd_size;
		cbt->ins = ins;

		/* Insert or append the WT_INSERT structure. */
		if (op == 1) {
			/*
			 * The serialized function clears ins: take a copy of
			 * the pointer so we can look up the record number.
			 */
			ins_copy = ins;

			WT_ERR(__wt_col_append_serial(session,
			    page, cbt->write_gen, inshead,
			    cbt->ins_stack, cbt->next_stack,
			    &new_inslist, new_inslist_size,
			    &new_inshead, new_inshead_size,
			    &ins, ins_size, skipdepth));

			/* Put the new recno into the cursor. */
			cbt->recno = WT_INSERT_RECNO(ins_copy);
		} else
			WT_ERR(__wt_insert_serial(session,
			    page, cbt->write_gen, inshead,
			    cbt->ins_stack, cbt->next_stack,
			    &new_inslist, new_inslist_size,
			    &new_inshead, new_inshead_size,
			    &ins, ins_size, skipdepth));
	}

	if (0) {
err:		/*
		 * Remove the update from the current transaction, so we don't
		 * try to modify it on rollback.
		 */
		if (logged)
			__wt_txn_unmodify(session);
		__wt_free(session, ins);
		__wt_free(session, upd);
	}

	__wt_free(session, new_inslist);
	__wt_free(session, new_inshead);

	return (ret);
}
Example #17
/*
 * __wt_schema_project_in --
 *	Given a list of cursors and a projection, read columns from the
 *	application into the dependent cursors.
 */
int
__wt_schema_project_in(WT_SESSION_IMPL *session,
    WT_CURSOR **cp, const char *proj_arg, va_list ap)
{
	WT_CURSOR *c;
	WT_DECL_ITEM(buf);
	WT_DECL_PACK_VALUE(pv);
	WT_DECL_PACK(pack);
	WT_PACK_VALUE old_pv;
	size_t len, offset, old_len;
	u_long arg;
	char *proj;
	uint8_t *p, *end;
	const uint8_t *next;

	p = end = NULL;		/* -Wuninitialized */

	/* Reset any of the buffers we will be setting. */
	for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
		arg = strtoul(proj, &proj, 10);
		if (*proj == WT_PROJ_KEY) {
			c = cp[arg];
			WT_RET(__wt_buf_init(session, &c->key, 0));
		} else if (*proj == WT_PROJ_VALUE) {
			c = cp[arg];
			WT_RET(__wt_buf_init(session, &c->value, 0));
		}
	}

	for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
		arg = strtoul(proj, &proj, 10);

		switch (*proj) {
		case WT_PROJ_KEY:
			c = cp[arg];
			if (WT_CURSOR_RECNO(c)) {
				c->key.data = &c->recno;
				c->key.size = sizeof(c->recno);
				WT_RET(__pack_init(session, &pack, "R"));
			} else
				WT_RET(__pack_init(
				    session, &pack, c->key_format));
			buf = &c->key;
			p = (uint8_t *)buf->data;
			end = p + buf->size;
			continue;

		case WT_PROJ_VALUE:
			c = cp[arg];
			WT_RET(__pack_init(session, &pack, c->value_format));
			buf = &c->value;
			p = (uint8_t *)buf->data;
			end = p + buf->size;
			continue;
		}

		/* We have to get a key or value before any operations. */
		WT_ASSERT(session, buf != NULL);

		/*
		 * Otherwise, the argument is a count, where a missing
		 * count means a count of 1.
		 */
		for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) {
			switch (*proj) {
			case WT_PROJ_SKIP:
				WT_RET(__pack_next(&pack, &pv));
				/*
				 * A nasty case: if we are inserting
				 * out-of-order, we may reach the end of the
				 * data.  That's okay: we want to append in
				 * that case, and we're positioned to do that.
				 */
				if (p == end) {
					/* Set up an empty value. */
					WT_CLEAR(pv.u);
					if (pv.type == 'S' || pv.type == 's')
						pv.u.s = "";

					len = __pack_size(session, &pv);
					WT_RET(__wt_buf_grow(session,
					    buf, buf->size + len));
					p = (uint8_t *)buf->mem + buf->size;
					WT_RET(__pack_write(
					    session, &pv, &p, len));
					buf->size += len;
					end = (uint8_t *)buf->mem + buf->size;
				} else if (*proj == WT_PROJ_SKIP)
					WT_RET(__unpack_read(session,
					    &pv, (const uint8_t **)&p,
					    (size_t)(end - p)));
				break;

			case WT_PROJ_NEXT:
				WT_RET(__pack_next(&pack, &pv));
				WT_PACK_GET(session, pv, ap);
				/* FALLTHROUGH */

			case WT_PROJ_REUSE:
				/* Read the item we're about to overwrite. */
				next = p;
				if (p < end) {
					old_pv = pv;
					WT_RET(__unpack_read(session, &old_pv,
					    &next, (size_t)(end - p)));
				}
				old_len = (size_t)(next - p);

				len = __pack_size(session, &pv);
				offset = WT_PTRDIFF(p, buf->mem);
				WT_RET(__wt_buf_grow(session,
				    buf, buf->size + len));
				p = (uint8_t *)buf->mem + offset;
				end = (uint8_t *)buf->mem + buf->size + len;
				/* Make room if we're inserting out-of-order. */
				if (offset + old_len < buf->size)
					memmove(p + len, p + old_len,
					    buf->size - (offset + old_len));
				WT_RET(__pack_write(session, &pv, &p, len));
				buf->size += len;
				break;

			default:
				WT_RET_MSG(session, EINVAL,
				    "unexpected projection plan: %c",
				    (int)*proj);
			}
		}
	}

	return (0);
}
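
The WT_PROJ_REUSE branch above shows the general pattern for rewriting an item in the middle of a packed buffer: remember the item's offset, grow the buffer, shift the tail with memmove to make room, then write the new bytes in place. Below is a standalone sketch of that pattern; the plain realloc-based buffer is an illustrative assumption, not the WiredTiger WT_ITEM/__wt_buf_grow API.

#include <stdlib.h>
#include <string.h>

/*
 * buf_replace --
 *	Replace old_len bytes at offset in a heap buffer of size used bytes
 *	with new_len replacement bytes, shifting the tail as needed.  The
 *	caller guarantees offset + old_len <= size.  Returns the new size,
 *	or 0 on allocation failure.
 */
size_t
buf_replace(unsigned char **bufp, size_t size, size_t offset, size_t old_len,
    const unsigned char *new_bytes, size_t new_len)
{
	unsigned char *buf;

	buf = *bufp;

	/* Grow the allocation if the replacement is larger than the original. */
	if (new_len > old_len) {
		if ((buf = realloc(buf, size + new_len - old_len)) == NULL)
			return (0);
		*bufp = buf;
	}

	/* Shift the tail of the buffer to make (or reclaim) room. */
	memmove(buf + offset + new_len, buf + offset + old_len,
	    size - (offset + old_len));

	/* Write the replacement bytes in place. */
	memcpy(buf + offset, new_bytes, new_len);

	return (size + new_len - old_len);
}
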
Example #18
/*
 * __wt_col_append_serial_func --
 *	Server function to append a WT_INSERT entry to the tree.
 */
int
__wt_col_append_serial_func(WT_SESSION_IMPL *session, void *args)
{
	WT_BTREE *btree;
	WT_INSERT *ins, *new_ins, ***ins_stack, **next_stack;
	WT_INSERT_HEAD *inshead, **insheadp, **new_inslist, *new_inshead;
	WT_PAGE *page;
	uint64_t recno;
	uint32_t write_gen;
	u_int i, skipdepth;

	btree = S2BT(session);

	__wt_col_append_unpack(args,
	    &page, &write_gen, &insheadp, &ins_stack, &next_stack,
	    &new_inslist, &new_inshead, &new_ins, &skipdepth);

	/* Check the page's write-generation. */
	WT_RET(__wt_page_write_gen_check(session, page, write_gen));

	if ((inshead = *insheadp) == NULL)
		inshead = new_inshead;

	/*
	 * If the application specified a record number, there's a race: the
	 * application may have searched for the record, not found it, then
	 * called into the append code, and another thread might have added
	 * the record.  Fortunately, we're in the right place because if the
	 * record didn't exist at some point, it can only have been created
	 * on this list.  Search for the record, if specified.
	 */
	if ((recno = WT_INSERT_RECNO(new_ins)) == 0)
		recno = WT_INSERT_RECNO(new_ins) = ++btree->last_recno;

	ins = __col_insert_search(inshead, ins_stack, next_stack, recno);

	/* If we find the record number, there's been a race. */
	if (ins != NULL && WT_INSERT_RECNO(ins) == recno)
		WT_RET(WT_RESTART);

	/*
	 * Publish: First, point the new WT_INSERT item's skiplist references
	 * to the next elements in the insert list, then flush memory.  Second,
	 * update the skiplist elements that reference the new WT_INSERT item,
	 * this ensures the list is never inconsistent.
	 */
	for (i = 0; i < skipdepth; i++)
		new_ins->next[i] = *ins_stack[i];
	WT_WRITE_BARRIER();
	for (i = 0; i < skipdepth; i++) {
		if (inshead->tail[i] == NULL ||
		    ins_stack[i] == &inshead->tail[i]->next[i])
			inshead->tail[i] = new_ins;
		*ins_stack[i] = new_ins;
	}

	__wt_col_append_new_ins_taken(args);

	/*
	 * If the insert head does not yet have an insert list, our caller
	 * passed us one.
	 *
	 * NOTE: it is important to do this after the item has been added to
	 * the list.  Code can assume that if the list is set, it is non-empty.
	 */
	if (*insheadp == NULL) {
		WT_PUBLISH(*insheadp, new_inshead);
		__wt_col_append_new_inshead_taken(args);
	}

	/*
	 * If the page does not yet have an insert array, our caller passed
	 * us one.
	 *
	 * NOTE: it is important to do this after publishing the list entry.
	 * Code can assume that if the array is set, it is non-empty.
	 */
	if (page->modify->append == NULL) {
		page->modify->append = new_inslist;
		__wt_col_append_new_inslist_taken(args);
	}

	/*
	 * If we don't find the record, check to see if we extended the file,
	 * and update the last record number.
	 */
	if (recno > btree->last_recno)
		btree->last_recno = recno;

	__wt_page_and_tree_modify_set(session, page);
	return (0);
}
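
The two publish loops above are the heart of the lock-free insert: the new node's forward pointers are filled in first, a write barrier makes them visible, and only then is the node linked into the list, so a concurrent reader can never follow a pointer into a half-initialized element. A minimal standalone sketch of the same ordering follows; the node layout and the GCC/Clang release fence stand in for WiredTiger's structures and WT_WRITE_BARRIER and are assumptions, not the real implementation.

#define	SKIP_MAXDEPTH	10	/* Assumed maximum skiplist depth. */

struct skip_node {
	struct skip_node *next[SKIP_MAXDEPTH];
	/* ... key and value fields would follow ... */
};

/*
 * skip_insert_publish --
 *	Link new_node into a skiplist at the slots recorded in ins_stack,
 *	making it visible to lock-free readers only once it is complete.
 */
void
skip_insert_publish(struct skip_node **ins_stack[],
    struct skip_node *new_node, unsigned int depth)
{
	unsigned int i;

	/* First, point the new node at its successors. */
	for (i = 0; i < depth; ++i)
		new_node->next[i] = *ins_stack[i];

	/* Ensure the successor pointers are visible before the node is linked. */
	__atomic_thread_fence(__ATOMIC_RELEASE);

	/* Second, link the node in; readers now always see a consistent list. */
	for (i = 0; i < depth; ++i)
		*ins_stack[i] = new_node;
}
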
Example #19
/*
 * __wt_schema_project_slice --
 *	Given a list of cursors and a projection, read columns from
 *	a raw buffer.
 */
int
__wt_schema_project_slice(WT_SESSION_IMPL *session, WT_CURSOR **cp,
    const char *proj_arg, bool key_only, const char *vformat, WT_ITEM *value)
{
	WT_CURSOR *c;
	WT_DECL_ITEM(buf);
	WT_DECL_PACK(pack);
	WT_DECL_PACK_VALUE(pv);
	WT_DECL_PACK_VALUE(vpv);
	WT_PACK vpack;
	u_long arg;
	char *proj;
	uint8_t *end, *p;
	const uint8_t *next, *vp, *vend;
	size_t len, offset, old_len;
	bool skip;

	p = end = NULL;		/* -Wuninitialized */

	WT_RET(__pack_init(session, &vpack, vformat));
	vp = value->data;
	vend = vp + value->size;

	/* Reset any of the buffers we will be setting. */
	for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
		arg = strtoul(proj, &proj, 10);
		if (*proj == WT_PROJ_KEY) {
			c = cp[arg];
			WT_RET(__wt_buf_init(session, &c->key, 0));
		} else if (*proj == WT_PROJ_VALUE && !key_only) {
			c = cp[arg];
			WT_RET(__wt_buf_init(session, &c->value, 0));
		}
	}

	skip = key_only;
	for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
		arg = strtoul(proj, &proj, 10);

		switch (*proj) {
		case WT_PROJ_KEY:
			skip = false;
			c = cp[arg];
			if (WT_CURSOR_RECNO(c)) {
				c->key.data = &c->recno;
				c->key.size = sizeof(c->recno);
				WT_RET(__pack_init(session, &pack, "R"));
			} else
				WT_RET(__pack_init(
				    session, &pack, c->key_format));
			buf = &c->key;
			p = (uint8_t *)buf->data;
			end = p + buf->size;
			continue;

		case WT_PROJ_VALUE:
			skip = key_only;
			if (skip)
				continue;
			c = cp[arg];
			WT_RET(__pack_init(session, &pack, c->value_format));
			buf = &c->value;
			p = (uint8_t *)buf->data;
			end = p + buf->size;
			continue;
		}

		/* We have to get a key or value before any operations. */
		WT_ASSERT(session, skip || buf != NULL);

		/*
		 * Otherwise, the argument is a count, where a missing
		 * count means a count of 1.
		 */
		for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) {
			switch (*proj) {
			case WT_PROJ_SKIP:
				if (skip)
					break;
				WT_RET(__pack_next(&pack, &pv));

				/*
				 * A nasty case: if we are inserting
				 * out-of-order, append a zero value to keep
				 * the buffer in the correct format.
				 */
				if (p == end) {
					/* Set up an empty value. */
					WT_CLEAR(pv.u);
					if (pv.type == 'S' || pv.type == 's')
						pv.u.s = "";

					len = __pack_size(session, &pv);
					WT_RET(__wt_buf_grow(session,
					    buf, buf->size + len));
					p = (uint8_t *)buf->data + buf->size;
					WT_RET(__pack_write(
					    session, &pv, &p, len));
					end = p;
					buf->size += len;
				} else
					WT_RET(__unpack_read(session,
					    &pv, (const uint8_t **)&p,
					    (size_t)(end - p)));
				break;

			case WT_PROJ_NEXT:
				WT_RET(__pack_next(&vpack, &vpv));
				WT_RET(__unpack_read(session, &vpv,
				    &vp, (size_t)(vend - vp)));
				/* FALLTHROUGH */

			case WT_PROJ_REUSE:
				if (skip)
					break;

				/*
				 * Read the item we're about to overwrite.
				 *
				 * There is subtlety here: the value format
				 * may not exactly match the cursor's format.
				 * In particular, we need lengths with raw
				 * columns in the middle of a packed struct,
				 * but not if they are at the end of a struct.
				 */
				WT_RET(__pack_next(&pack, &pv));

				next = p;
				if (p < end)
					WT_RET(__unpack_read(session, &pv,
					    &next, (size_t)(end - p)));
				old_len = (size_t)(next - p);

				/* Make sure the types are compatible. */
				WT_ASSERT(session,
				    __wt_tolower((u_char)pv.type) ==
				    __wt_tolower((u_char)vpv.type));
				pv.u = vpv.u;

				len = __pack_size(session, &pv);
				offset = WT_PTRDIFF(p, buf->data);
				/*
				 * Avoid growing the buffer if the value fits.
				 * This is not just a performance issue: it
				 * covers the case of record number keys, which
				 * have to be written to cursor->recno.
				 */
				if (len > old_len)
					WT_RET(__wt_buf_grow(session,
					    buf, buf->size + len - old_len));
				p = (uint8_t *)buf->data + offset;
				/* Make room if we're inserting out-of-order. */
				if (offset + old_len < buf->size)
					memmove(p + len, p + old_len,
					    buf->size - (offset + old_len));
				WT_RET(__pack_write(session, &pv, &p, len));
				buf->size += len - old_len;
				end = (uint8_t *)buf->data + buf->size;
				break;
			default:
				WT_RET_MSG(session, EINVAL,
				    "unexpected projection plan: %c",
				    (int)*proj);
			}
		}
	}

	return (0);
}
Example #20
/*
 * __ovfl_reuse_wrapup --
 *	Resolve the page's overflow reuse list after a page is written.
 */
static int
__ovfl_reuse_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_BM *bm;
	WT_OVFL_REUSE **e, **head, *reuse;
	size_t decr;
	int i;

	bm = S2BT(session)->bm;
	head = page->modify->ovfl_track->ovfl_reuse;

	/*
	 * Discard any overflow records that aren't in-use, freeing underlying
	 * blocks.
	 *
	 * First, walk the overflow reuse lists (except for the lowest one),
	 * fixing up skiplist links.
	 */
	for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i)
		for (e = &head[i]; (reuse = *e) != NULL;) {
			if (F_ISSET(reuse, WT_OVFL_REUSE_INUSE)) {
				e = &reuse->next[i];
				continue;
			}
			*e = reuse->next[i];
		}

	/*
	 * Second, discard any overflow record without an in-use flag, clear
	 * the flags for the next run.
	 *
	 * As part of the pass through the lowest level, figure out how much
	 * space we added/subtracted from the page, and update its footprint.
	 * We don't get it exactly correct because we don't know the depth of
	 * the skiplist here, but it's close enough, and figuring out the
	 * memory footprint change in the reconciliation wrapup code means
	 * fewer atomic updates and less code overall.
	 */
	decr = 0;
	for (e = &head[0]; (reuse = *e) != NULL;) {
		if (F_ISSET(reuse, WT_OVFL_REUSE_INUSE)) {
			F_CLR(reuse,
			    WT_OVFL_REUSE_INUSE | WT_OVFL_REUSE_JUST_ADDED);
			e = &reuse->next[0];
			continue;
		}
		*e = reuse->next[0];

		WT_ASSERT(session, !F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED));

		if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
			WT_RET(
			    __ovfl_reuse_verbose(session, page, reuse, "free"));

		WT_RET(bm->free(
		    bm, session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size));
		decr += WT_OVFL_SIZE(reuse, WT_OVFL_REUSE);
		__wt_free(session, reuse);
	}

	if (decr != 0)
		__wt_cache_page_inmem_decr(session, page, decr);
	return (0);
}
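
Both walks above rely on the pointer-to-pointer idiom for unlinking entries from a singly-linked skiplist level: the loop carries a pointer to the current link slot, so a node can be spliced out without tracking a separate "previous" node. A standalone sketch of the idiom is below; the node type and the in_use flag are illustrative assumptions standing in for WT_OVFL_REUSE and WT_OVFL_REUSE_INUSE.

#include <stdlib.h>

struct node {
	struct node *next;
	int in_use;		/* Stand-in for the in-use flag. */
};

/*
 * prune_list --
 *	Remove and free every node not marked in-use, walking a pointer to
 *	the link slot so no "previous node" bookkeeping is needed.
 */
void
prune_list(struct node **head)
{
	struct node **e, *n;

	for (e = head; (n = *e) != NULL;) {
		if (n->in_use) {
			e = &n->next;	/* Keep the node, advance to its slot. */
			continue;
		}
		*e = n->next;		/* Unlink the node from the list. */
		free(n);
	}
}
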
Example #21
/*
 * __sync_file --
 *	Flush pages for a specific file.
 */
static int
__sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
{
	struct timespec end, start;
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	WT_REF *walk;
	WT_TXN *txn;
	uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages;
	uint64_t saved_snap_min;
	uint32_t flags;
	bool evict_reset;

	btree = S2BT(session);

	walk = NULL;
	txn = &session->txn;
	saved_snap_min = WT_SESSION_TXN_STATE(session)->snap_min;
	flags = WT_READ_CACHE | WT_READ_NO_GEN;

	internal_bytes = leaf_bytes = 0;
	internal_pages = leaf_pages = 0;
	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
		WT_RET(__wt_epoch(session, &start));

	switch (syncop) {
	case WT_SYNC_WRITE_LEAVES:
		/*
		 * Write all immediately available, dirty in-cache leaf pages.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock, serialize so multiple threads don't walk the tree at
		 * the same time.
		 */
		if (!btree->modified)
			return (0);
		__wt_spin_lock(session, &btree->flush_lock);
		if (!btree->modified) {
			__wt_spin_unlock(session, &btree->flush_lock);
			return (0);
		}

		flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL;
		for (walk = NULL;;) {
			WT_ERR(__wt_tree_walk(session, &walk, NULL, flags));
			if (walk == NULL)
				break;

			/*
			 * Write dirty pages if nobody beat us to it.  Don't
			 * try to write the hottest pages: checkpoint will have
			 * to visit them anyway.
			 */
			page = walk->page;
			if (__wt_page_is_modified(page) &&
			    __wt_txn_visible_all(
			    session, page->modify->update_txn)) {
				if (txn->isolation == WT_ISO_READ_COMMITTED)
					__wt_txn_get_snapshot(session);
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
				WT_ERR(__wt_reconcile(session, walk, NULL, 0));
			}
		}
		break;
	case WT_SYNC_CHECKPOINT:
		/*
		 * If we are flushing a file at read-committed isolation, which
		 * is of particular interest for flushing the metadata to make
		 * a schema-changing operation durable, get a transactional
		 * snapshot now.
		 *
		 * All changes committed up to this point should be included.
		 * We don't update the snapshot in between pages because the
		 * metadata shouldn't be that big.
		 */
		if (txn->isolation == WT_ISO_READ_COMMITTED)
			__wt_txn_get_snapshot(session);

		/*
		 * We cannot check the tree modified flag in the case of a
		 * checkpoint, the checkpoint code has already cleared it.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock; serialize so multiple threads don't walk the tree at
		 * the same time.  We're holding the schema lock, but need the
		 * lower-level lock as well.
		 */
		__wt_spin_lock(session, &btree->flush_lock);

		/*
		 * When internal pages are being reconciled by checkpoint their
		 * child pages cannot disappear from underneath them or be split
		 * into them, nor can underlying blocks be freed until the block
		 * lists for the checkpoint are stable.  Set the checkpointing
		 * flag to block eviction of dirty pages until the checkpoint's
		 * internal page pass is complete, then wait for any existing
		 * eviction to complete.
		 */
		WT_PUBLISH(btree->checkpointing, WT_CKPT_PREPARE);

		WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset));
		if (evict_reset)
			__wt_evict_file_exclusive_off(session);

		WT_PUBLISH(btree->checkpointing, WT_CKPT_RUNNING);

		/* Write all dirty in-cache pages. */
		flags |= WT_READ_NO_EVICT;
		for (walk = NULL;;) {
			WT_ERR(__wt_tree_walk(session, &walk, NULL, flags));
			if (walk == NULL)
				break;

			/* Skip clean pages. */
			if (!__wt_page_is_modified(walk->page))
				continue;

			/*
			 * Take a local reference to the page modify structure
			 * now that we know the page is dirty. It needs to be
			 * done in this order otherwise the page modify
			 * structure could have been created between taking the
			 * reference and checking modified.
			 */
			page = walk->page;
			mod = page->modify;

			/*
			 * Write dirty pages, unless we can be sure they only
			 * became dirty after the checkpoint started.
			 *
			 * We can skip dirty pages if:
			 * (1) they are leaf pages;
			 * (2) there is a snapshot transaction active (which
			 *     is the case in ordinary application checkpoints
			 *     but not all internal cases); and
			 * (3) the first dirty update on the page is
			 *     sufficiently recent that the checkpoint
			 *     transaction would skip them.
			 *
			 * Mark the tree dirty: the checkpoint marked it clean
			 * and we can't skip future checkpoints until this page
			 * is written.
			 */
			if (!WT_PAGE_IS_INTERNAL(page) &&
			    F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) &&
			    WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn)) {
				__wt_page_modify_set(session, page);
				continue;
			}

			if (WT_PAGE_IS_INTERNAL(page)) {
				internal_bytes += page->memory_footprint;
				++internal_pages;
			} else {
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
			}
			WT_ERR(__wt_reconcile(session, walk, NULL, 0));
		}
		break;
	case WT_SYNC_CLOSE:
	case WT_SYNC_DISCARD:
	WT_ILLEGAL_VALUE_ERR(session);
	}

	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
		WT_ERR(__wt_epoch(session, &end));
		WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
		    "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64
		    " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64
		    " bytes, %" PRIu64 " pages of internal\n\t"
		    "Took: %" PRIu64 "ms",
		    syncop == WT_SYNC_WRITE_LEAVES ?
		    "WRITE_LEAVES" : "CHECKPOINT",
		    leaf_bytes, leaf_pages, internal_bytes, internal_pages,
		    WT_TIMEDIFF_MS(end, start)));
	}

err:	/* On error, clear any left-over tree walk. */
	if (walk != NULL)
		WT_TRET(__wt_page_release(session, walk, flags));

	/*
	 * If we got a snapshot in order to write pages, and there was no
	 * snapshot active when we started, release it.
	 */
	if (txn->isolation == WT_ISO_READ_COMMITTED &&
	    saved_snap_min == WT_TXN_NONE)
		__wt_txn_release_snapshot(session);

	if (btree->checkpointing != WT_CKPT_OFF) {
		/*
		 * Update the checkpoint generation for this handle so visible
		 * updates newer than the checkpoint can be evicted.
		 *
		 * This has to be published before eviction is enabled again,
		 * so that eviction knows that the checkpoint has completed.
		 */
		WT_PUBLISH(btree->checkpoint_gen,
		    S2C(session)->txn_global.checkpoint_gen);
		WT_STAT_FAST_DATA_SET(session,
		    btree_checkpoint_generation, btree->checkpoint_gen);

		/*
		 * Clear the checkpoint flag and push the change; not required,
		 * but publishing the change means stalled eviction gets moving
		 * as soon as possible.
		 */
		btree->checkpointing = WT_CKPT_OFF;
		WT_FULL_BARRIER();

		/*
		 * If this tree was being skipped by the eviction server during
		 * the checkpoint, clear the wait.
		 */
		btree->evict_walk_period = 0;

		/*
		 * Wake the eviction server, in case application threads have
		 * stalled while the eviction server decided it couldn't make
		 * progress.  Without this, application threads will be stalled
		 * until the eviction server next wakes.
		 */
		WT_TRET(__wt_evict_server_wake(session));
	}

	__wt_spin_unlock(session, &btree->flush_lock);

	/*
	 * Leaves are written before a checkpoint (or as part of a file close,
	 * before checkpointing the file).  Start a flush to stable storage,
	 * but don't wait for it.
	 */
	if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES)
		WT_RET(btree->bm->sync(btree->bm, session, true));

	return (ret);
}
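
The verbose path above samples a timestamp before and after the tree walk and reports the difference in milliseconds via WT_TIMEDIFF_MS. A standalone sketch of the equivalent arithmetic is below; the use of clock_gettime with CLOCK_MONOTONIC is an illustrative assumption, not how __wt_epoch is implemented.

#include <stdint.h>
#include <time.h>

/*
 * elapsed_ms --
 *	Return the elapsed time between two timestamps in milliseconds.
 *	Typical use around a timed operation:
 *
 *		(void)clock_gettime(CLOCK_MONOTONIC, &start);
 *		... do the work ...
 *		(void)clock_gettime(CLOCK_MONOTONIC, &end);
 *		ms = elapsed_ms(&start, &end);
 */
uint64_t
elapsed_ms(const struct timespec *start, const struct timespec *end)
{
	int64_t sec, nsec;

	sec = (int64_t)end->tv_sec - (int64_t)start->tv_sec;
	nsec = (int64_t)end->tv_nsec - (int64_t)start->tv_nsec;
	return ((uint64_t)(sec * 1000 + nsec / 1000000));
}
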
Example #22
/*
 * __wt_bt_read --
 *	Read a cookie referenced block into a buffer.
 */
int
__wt_bt_read(WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, uint32_t addr_size)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_PAGE_HEADER *dsk;
	size_t result_len;

	btree = session->btree;
	bm = btree->bm;

	/*
	 * If anticipating a compressed block, read into a scratch buffer and
	 * decompress into the caller's buffer.  Else, read directly into the
	 * caller's buffer.
	 */
	if (btree->compressor == NULL) {
		WT_RET(bm->read(bm, session, buf, addr, addr_size));
		dsk = buf->mem;
	} else {
		WT_RET(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->read(bm, session, tmp, addr, addr_size));
		dsk = tmp->mem;
	}

	/*
	 * If the block is compressed, copy the skipped bytes of the original
	 * image into place, then decompress.
	 */
	if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) {
		if (btree->compressor == NULL ||
		    btree->compressor->decompress == NULL)
			WT_ERR_MSG(session, WT_ERROR,
			    "read compressed block where no compression engine "
			    "configured");

		/*
		 * We're allocating the exact number of bytes we're expecting
		 * from decompression.
		 */
		WT_ERR(__wt_buf_init(session, buf, dsk->mem_size));
		buf->size = dsk->mem_size;

		/*
		 * Note the source length is NOT the number of compressed bytes,
		 * it's the length of the block we just read (minus the skipped
		 * bytes).  We don't store the number of compressed bytes: some
		 * compression engines need that length stored externally, they
		 * don't have markers in the stream to signal the end of the
		 * compressed bytes.  Those engines must store the compressed
		 * byte length somehow, see the snappy compression extension for
		 * an example.
		 */
		memcpy(buf->mem, tmp->mem, WT_BLOCK_COMPRESS_SKIP);
		WT_ERR(btree->compressor->decompress(
		    btree->compressor, &session->iface,
		    (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP,
		    tmp->size - WT_BLOCK_COMPRESS_SKIP,
		    (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP,
		    dsk->mem_size - WT_BLOCK_COMPRESS_SKIP,
		    &result_len));

		/*
		 * If checksums were turned off because we're depending on the
		 * decompression to fail on any corrupted data, we'll end up
		 * here after corruption happens.  If we're salvaging the file,
		 * it's OK, otherwise it's really, really bad.
		 */
		if (result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP)
			WT_ERR(
			    F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR) ?
			    WT_ERROR :
			    __wt_illegal_value(session, btree->name));
	} else
		if (btree->compressor == NULL)
			buf->size = dsk->mem_size;
		else
			/*
			 * We guessed wrong: there was a compressor, but this
			 * block was not compressed, and now the page is in the
			 * wrong buffer and the buffer may be of the wrong size.
			 * This should be rare, but happens with small blocks
			 * that aren't worth compressing.
			 */
			WT_ERR(__wt_buf_set(
			    session, buf, tmp->data, dsk->mem_size));

	/* If the handle is a verify handle, verify the physical page. */
	if (F_ISSET(btree, WT_BTREE_VERIFY)) {
		if (tmp == NULL)
			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size));
		WT_ERR(__wt_verify_dsk(session, (const char *)tmp->data, buf));
	}

	WT_CSTAT_INCR(session, cache_read);
	WT_DSTAT_INCR(session, cache_read);
	if (F_ISSET(dsk, WT_PAGE_COMPRESSED))
		WT_DSTAT_INCR(session, compress_read);
	WT_CSTAT_INCRV(session, cache_bytes_read, addr_size);
	WT_DSTAT_INCRV(session, cache_bytes_read, addr_size);

err:	__wt_scr_free(&tmp);
	return (ret);
}
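
The comment above notes that the source length passed to decompress is the block length, not the compressed byte count, and that engines needing the exact compressed length (snappy, for example) must store it in the image themselves. One common way to do that is to prefix the payload with its byte count; the fixed 8-byte little-endian prefix below is an illustrative assumption, not WiredTiger's or snappy's on-disk format.

#include <stdint.h>

/*
 * compressed_len_write --
 *	Record the compressed payload length at the front of the destination
 *	buffer so the matching decompress call can recover it later.
 */
void
compressed_len_write(uint8_t *dst, uint64_t compressed_len)
{
	unsigned int i;

	/* Store the length in a fixed-width, byte-order-independent form. */
	for (i = 0; i < 8; ++i)
		dst[i] = (uint8_t)(compressed_len >> (8 * i));
}

/*
 * compressed_len_read --
 *	Recover the length written by compressed_len_write.
 */
uint64_t
compressed_len_read(const uint8_t *src)
{
	uint64_t compressed_len;
	unsigned int i;

	compressed_len = 0;
	for (i = 0; i < 8; ++i)
		compressed_len |= (uint64_t)src[i] << (8 * i);
	return (compressed_len);
}
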
Example #23
/*
 * __wt_lsm_merge --
 *	Merge a set of chunks of an LSM tree.
 */
int
__wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
{
	WT_BLOOM *bloom;
	WT_CURSOR *dest, *src;
	WT_DECL_RET;
	WT_ITEM key, value;
	WT_LSM_CHUNK *chunk;
	uint32_t generation;
	uint64_t insert_count, record_count;
	u_int dest_id, end_chunk, i, nchunks, start_chunk, start_id;
	u_int created_chunk, verb;
	int create_bloom, locked, in_sync, tret;
	const char *cfg[3];
	const char *drop_cfg[] =
	    { WT_CONFIG_BASE(session, session_drop), "force", NULL };

	bloom = NULL;
	chunk = NULL;
	create_bloom = 0;
	created_chunk = 0;
	dest = src = NULL;
	locked = 0;
	start_id = 0;
	in_sync = 0;

	/* Fast path if it's obvious no merges could be done. */
	if (lsm_tree->nchunks < lsm_tree->merge_min &&
	    lsm_tree->merge_aggressiveness < WT_LSM_AGGRESSIVE_THRESHOLD)
		return (WT_NOTFOUND);

	/*
	 * Use the lsm_tree lock to read the chunks (so no switches occur), but
	 * avoid holding it while the merge is in progress: that may take a
	 * long time.
	 */
	WT_RET(__wt_lsm_tree_writelock(session, lsm_tree));
	locked = 1;

	WT_ERR(__lsm_merge_span(session,
	    lsm_tree, id, &start_chunk, &end_chunk, &record_count));
	nchunks = (end_chunk + 1) - start_chunk;

	WT_ASSERT(session, nchunks > 0);
	start_id = lsm_tree->chunk[start_chunk]->id;

	/* Find the merge generation. */
	for (generation = 0, i = 0; i < nchunks; i++)
		generation = WT_MAX(generation,
		    lsm_tree->chunk[start_chunk + i]->generation + 1);

	WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));
	locked = 0;

	/* Allocate an ID for the merge. */
	dest_id = WT_ATOMIC_ADD4(lsm_tree->last, 1);

	/*
	 * We only want to do the chunk loop if we're running with verbose,
	 * so we wrap these statements in the conditional.  Avoid the loop
	 * in the normal path.
	 */
	if (WT_VERBOSE_ISSET(session, WT_VERB_LSM)) {
		WT_ERR(__wt_verbose(session, WT_VERB_LSM,
		    "Merging %s chunks %u-%u into %u (%" PRIu64 " records)"
		    ", generation %" PRIu32,
		    lsm_tree->name,
		    start_chunk, end_chunk, dest_id, record_count, generation));
		for (verb = start_chunk; verb <= end_chunk; verb++)
			WT_ERR(__wt_verbose(session, WT_VERB_LSM,
			    "%s: Chunk[%u] id %u",
			    lsm_tree->name, verb, lsm_tree->chunk[verb]->id));
	}

	WT_ERR(__wt_calloc_def(session, 1, &chunk));
	created_chunk = 1;
	chunk->id = dest_id;

	if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_MERGED) &&
	    (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST) ||
	    start_chunk > 0) && record_count > 0)
		create_bloom = 1;

	/*
	 * Special setup for the merge cursor:
	 * first, reset to open the dependent cursors;
	 * then restrict the cursor to a specific number of chunks;
	 * then set MERGE so the cursor doesn't track updates to the tree.
	 */
	WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
	F_SET(src, WT_CURSTD_RAW);
	WT_ERR(__wt_clsm_init_merge(src, start_chunk, start_id, nchunks));

	WT_WITH_SCHEMA_LOCK(session,
	    ret = __wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
	WT_ERR(ret);
	if (create_bloom) {
		WT_ERR(__wt_lsm_tree_setup_bloom(session, lsm_tree, chunk));

		WT_ERR(__wt_bloom_create(session, chunk->bloom_uri,
		    lsm_tree->bloom_config,
		    record_count, lsm_tree->bloom_bit_count,
		    lsm_tree->bloom_hash_count, &bloom));
	}

	/* Discard pages we read as soon as we're done with them. */
	F_SET(session, WT_SESSION_NO_CACHE);

	cfg[0] = WT_CONFIG_BASE(session, session_open_cursor);
	cfg[1] = "bulk,raw,skip_sort_check";
	cfg[2] = NULL;
	WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));

#define	LSM_MERGE_CHECK_INTERVAL	1000
	for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
		if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) {
			if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
				WT_ERR(EINTR);

			WT_STAT_FAST_CONN_INCRV(session,
			    lsm_rows_merged, LSM_MERGE_CHECK_INTERVAL);
			++lsm_tree->merge_progressing;
		}

		WT_ERR(src->get_key(src, &key));
		dest->set_key(dest, &key);
		WT_ERR(src->get_value(src, &value));
		dest->set_value(dest, &value);
		WT_ERR(dest->insert(dest));
		if (create_bloom)
			WT_ERR(__wt_bloom_insert(bloom, &key));
	}
	WT_ERR_NOTFOUND_OK(ret);

	WT_STAT_FAST_CONN_INCRV(session,
	    lsm_rows_merged, insert_count % LSM_MERGE_CHECK_INTERVAL);
	++lsm_tree->merge_progressing;
	WT_ERR(__wt_verbose(session, WT_VERB_LSM,
	    "Bloom size for %" PRIu64 " has %" PRIu64 " items inserted.",
	    record_count, insert_count));

	/*
	 * Closing and syncing the files can take a while.  Set the
	 * merge_syncing field so that compact knows it is still in
	 * progress.
	 */
	(void)WT_ATOMIC_ADD4(lsm_tree->merge_syncing, 1);
	in_sync = 1;
	/*
	 * We've successfully created the new chunk.  Now install it.  We need
	 * to ensure that the NO_CACHE flag is cleared and the bloom filter
	 * is closed (even if a step fails), so track errors but don't return
	 * until we've cleaned up.
	 */
	WT_TRET(src->close(src));
	WT_TRET(dest->close(dest));
	src = dest = NULL;

	F_CLR(session, WT_SESSION_NO_CACHE);

	/*
	 * We're doing advisory reads to fault the new trees into cache.
	 * Don't block if the cache is full: our next unit of work may be to
	 * discard some trees to free space.
	 */
	F_SET(session, WT_SESSION_NO_CACHE_CHECK);

	if (create_bloom) {
		if (ret == 0)
			WT_TRET(__wt_bloom_finalize(bloom));

		/*
		 * Read in a key to make sure the Bloom filters btree handle is
		 * open before it becomes visible to application threads.
		 * Otherwise application threads will stall while it is opened
		 * and internal pages are read into cache.
		 */
		if (ret == 0) {
			WT_CLEAR(key);
			WT_TRET_NOTFOUND_OK(__wt_bloom_get(bloom, &key));
		}

		WT_TRET(__wt_bloom_close(bloom));
		bloom = NULL;
	}
	WT_ERR(ret);

	/*
	 * Open a handle on the new chunk before application threads attempt
	 * to access it, opening it pre-loads internal pages into the file
	 * system cache.
	 */
	cfg[1] = "checkpoint=" WT_CHECKPOINT;
	WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));
	WT_TRET(dest->close(dest));
	dest = NULL;
	++lsm_tree->merge_progressing;
	(void)WT_ATOMIC_SUB4(lsm_tree->merge_syncing, 1);
	in_sync = 0;
	WT_ERR_NOTFOUND_OK(ret);

	WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));
	WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
	locked = 1;

	/*
	 * Check whether we raced with another merge, and adjust the chunk
	 * array offset as necessary.
	 */
	if (start_chunk >= lsm_tree->nchunks ||
	    lsm_tree->chunk[start_chunk]->id != start_id)
		for (start_chunk = 0;
		    start_chunk < lsm_tree->nchunks;
		    start_chunk++)
			if (lsm_tree->chunk[start_chunk]->id == start_id)
				break;

	/*
	 * It is safe to error out here: the update can only fail prior to
	 * making updates to the tree.
	 */
	WT_ERR(__wt_lsm_merge_update_tree(
	    session, lsm_tree, start_chunk, nchunks, chunk));

	if (create_bloom)
		F_SET(chunk, WT_LSM_CHUNK_BLOOM);
	chunk->count = insert_count;
	chunk->generation = generation;
	F_SET(chunk, WT_LSM_CHUNK_ONDISK);

	/*
	 * We have no current way of continuing if the metadata update fails,
	 * so we will panic in that case.  Put some effort into cleaning up
	 * after ourselves here - so things have a chance of shutting down.
	 *
	 * Any errors that happened after the tree was locked are
	 * fatal - we can't guarantee the state of the tree.
	 */
	if ((ret = __wt_lsm_meta_write(session, lsm_tree)) != 0)
		WT_PANIC_ERR(session, ret, "Failed finalizing LSM merge");

	lsm_tree->dsk_gen++;

	/* Update the throttling while holding the tree lock. */
	__wt_lsm_tree_throttle(session, lsm_tree, 1);

	/* Schedule a pass to discard old chunks */
	WT_ERR(__wt_lsm_manager_push_entry(
	    session, WT_LSM_WORK_DROP, 0, lsm_tree));

err:	if (locked)
		WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
	if (in_sync)
		(void)WT_ATOMIC_SUB4(lsm_tree->merge_syncing, 1);
	if (src != NULL)
		WT_TRET(src->close(src));
	if (dest != NULL)
		WT_TRET(dest->close(dest));
	if (bloom != NULL)
		WT_TRET(__wt_bloom_close(bloom));
	if (ret != 0 && created_chunk) {
		/* Drop the newly-created files on error. */
		if (chunk->uri != NULL) {
			WT_WITH_SCHEMA_LOCK(session, tret =
			    __wt_schema_drop(session, chunk->uri, drop_cfg));
			WT_TRET(tret);
		}
		if (create_bloom && chunk->bloom_uri != NULL) {
			WT_WITH_SCHEMA_LOCK(session,
			    tret = __wt_schema_drop(
			    session, chunk->bloom_uri, drop_cfg));
			WT_TRET(tret);
		}
		__wt_free(session, chunk->bloom_uri);
		__wt_free(session, chunk->uri);
		__wt_free(session, chunk);

		if (ret == EINTR)
			WT_TRET(__wt_verbose(session, WT_VERB_LSM,
			    "Merge aborted due to close"));
		else
			WT_TRET(__wt_verbose(session, WT_VERB_LSM,
			    "Merge failed with %s", wiredtiger_strerror(ret)));
	}
	F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK);
	return (ret);
}
Example #24
/*
 * __wt_txn_recover --
 *	Run recovery.
 */
int
__wt_txn_recover(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR *metac;
	WT_DECL_RET;
	WT_RECOVERY r;
	WT_RECOVERY_FILE *metafile;
	char *config;
	bool do_checkpoint, eviction_started, needs_rec, was_backup;

	conn = S2C(session);
	WT_CLEAR(r);
	WT_INIT_LSN(&r.ckpt_lsn);
	config = NULL;
	do_checkpoint = true;
	eviction_started = false;
	was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP);

	/* We need a real session for recovery. */
	WT_RET(__wt_open_internal_session(conn, "txn-recover",
	    false, WT_SESSION_NO_LOGGING, &session));
	r.session = session;
	WT_MAX_LSN(&r.max_ckpt_lsn);
	WT_MAX_LSN(&r.max_rec_lsn);
	conn->txn_global.recovery_timestamp =
	    conn->txn_global.meta_ckpt_timestamp = 0;

	F_SET(conn, WT_CONN_RECOVERING);
	WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config));
	WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config));
	WT_ERR(__wt_metadata_cursor_open(session, NULL, &metac));
	metafile = &r.files[WT_METAFILE_ID];
	metafile->c = metac;

	/*
	 * If no log was found (including if logging is disabled), or if the
	 * last checkpoint was done with logging disabled, recovery should not
	 * run.  Scan the metadata to figure out the largest file ID.
	 */
	if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_EXISTED) ||
	    WT_IS_MAX_LSN(&metafile->ckpt_lsn)) {
		/*
		 * Detect if we're going from logging disabled to enabled.
		 * We need to know this to verify LSNs and start at the correct
		 * log file later.  If someone ran with logging, then disabled
		 * it and removed all the log files and then turned logging back
		 * on, we have to start logging in the log file number that is
		 * larger than any checkpoint LSN we have from the earlier time.
		 */
		WT_ERR(__recovery_file_scan(&r));
		/*
		 * The array can be re-allocated in recovery_file_scan.  Reset
		 * our pointer after scanning all the files.
		 */
		metafile = &r.files[WT_METAFILE_ID];
		conn->next_file_id = r.max_fileid;

		if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
		    WT_IS_MAX_LSN(&metafile->ckpt_lsn) &&
		    !WT_IS_MAX_LSN(&r.max_ckpt_lsn))
			WT_ERR(__wt_log_reset(session, r.max_ckpt_lsn.l.file));
		else
			do_checkpoint = false;
		goto done;
	}

	/*
	 * First, do a pass through the log to recover the metadata, and
	 * establish the last checkpoint LSN.  Skip this when opening a hot
	 * backup: we already have the correct metadata in that case.
	 *
	 * If we're running with salvage and we hit an error, we ignore it
	 * and continue. In salvage we want to recover whatever part of the
	 * data we can from the last checkpoint up until whatever problem we
	 * detect in the log file. In salvage, we ignore errors from scanning
	 * the log so recovery can continue. Other errors remain errors.
	 */
	if (!was_backup) {
		r.metadata_only = true;
		/*
		 * If this is a read-only connection, check if the checkpoint
		 * LSN in the metadata file is up to date, indicating a clean
		 * shutdown.
		 */
		if (F_ISSET(conn, WT_CONN_READONLY)) {
			WT_ERR(__wt_log_needs_recovery(
			    session, &metafile->ckpt_lsn, &needs_rec));
			if (needs_rec)
				WT_ERR_MSG(session, WT_RUN_RECOVERY,
				    "Read-only database needs recovery");
		}
		if (WT_IS_INIT_LSN(&metafile->ckpt_lsn))
			ret = __wt_log_scan(session,
			    NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r);
		else {
			/*
			 * Start at the last checkpoint LSN referenced in the
			 * metadata.  If we see the end of a checkpoint while
			 * scanning, we will change the full scan to start from
			 * there.
			 */
			r.ckpt_lsn = metafile->ckpt_lsn;
			ret = __wt_log_scan(session,
			    &metafile->ckpt_lsn, 0, __txn_log_recover, &r);
		}
		if (F_ISSET(conn, WT_CONN_SALVAGE))
			ret = 0;
		/*
		 * If log scan couldn't find a file we expected to be around,
		 * this indicates a corruption of some sort.
		 */
		if (ret == ENOENT) {
			F_SET(conn, WT_CONN_DATA_CORRUPTION);
			ret = WT_ERROR;
		}

		WT_ERR(ret);
	}

	/* Scan the metadata to find the live files and their IDs. */
	WT_ERR(__recovery_file_scan(&r));
	/*
	 * Clear this out.  We no longer need it and it could have been
	 * re-allocated when scanning the files.
	 */
	WT_NOT_READ(metafile, NULL);

	/*
	 * We no longer need the metadata cursor: close it to avoid pinning any
	 * resources that could block eviction during recovery.
	 */
	r.files[0].c = NULL;
	WT_ERR(metac->close(metac));

	/*
	 * Now, recover all the files apart from the metadata.
	 * Pass WT_LOGSCAN_RECOVER so that old logs get truncated.
	 */
	r.metadata_only = false;
	__wt_verbose(session, WT_VERB_RECOVERY | WT_VERB_RECOVERY_PROGRESS,
	    "Main recovery loop: starting at %" PRIu32 "/%" PRIu32
	    " to %" PRIu32 "/%" PRIu32, r.ckpt_lsn.l.file, r.ckpt_lsn.l.offset,
	    r.max_rec_lsn.l.file, r.max_rec_lsn.l.offset);
	WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec));
	/*
	 * Check if the database was shut down cleanly.  If not
	 * return an error if the user does not want automatic
	 * recovery.
	 */
	if (needs_rec &&
	    (FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR) ||
	     F_ISSET(conn, WT_CONN_READONLY))) {
		if (F_ISSET(conn, WT_CONN_READONLY))
			WT_ERR_MSG(session, WT_RUN_RECOVERY,
			    "Read-only database needs recovery");
		WT_ERR_MSG(session, WT_RUN_RECOVERY, "Database needs recovery");
	}

	if (F_ISSET(conn, WT_CONN_READONLY)) {
		do_checkpoint = false;
		goto done;
	}

	/*
	 * Recovery can touch more data than fits in cache, so it relies on
	 * regular eviction to manage paging.  Start eviction threads for
	 * recovery without LAS cursors.
	 */
	WT_ERR(__wt_evict_create(session));
	eviction_started = true;

	/*
	 * Unless this is a read-only connection, always run recovery, even
	 * after a clean shutdown.  We can consider skipping it in the future.
	 */
	if (needs_rec)
		FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY);
	if (WT_IS_INIT_LSN(&r.ckpt_lsn))
		ret = __wt_log_scan(session, NULL,
		    WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER,
		    __txn_log_recover, &r);
	else
		ret = __wt_log_scan(session, &r.ckpt_lsn,
		    WT_LOGSCAN_RECOVER, __txn_log_recover, &r);
	if (F_ISSET(conn, WT_CONN_SALVAGE))
		ret = 0;
	WT_ERR(ret);

	conn->next_file_id = r.max_fileid;

done:	WT_ERR(__recovery_set_checkpoint_timestamp(&r));
	if (do_checkpoint)
		/*
		 * Forcibly log a checkpoint so the next open is fast and keep
		 * the metadata up to date with the checkpoint LSN and
		 * archiving.
		 */
		WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));

	/*
	 * If we're downgrading and have newer log files, force an archive,
	 * no matter what the archive setting is.
	 */
	if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_FORCE_DOWNGRADE))
		WT_ERR(__wt_log_truncate_files(session, NULL, true));
	FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE);

err:	WT_TRET(__recovery_free(&r));
	__wt_free(session, config);
	FLD_CLR(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY);

	if (ret != 0) {
		FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_FAILED);
		__wt_err(session, ret, "Recovery failed");
	}

	/*
	 * Destroy the eviction threads that were started in support of
	 * recovery.  They will be restarted once the lookaside table is
	 * created.
	 */
	if (eviction_started)
		WT_TRET(__wt_evict_destroy(session));

	WT_TRET(session->iface.close(&session->iface, NULL));
	F_CLR(conn, WT_CONN_RECOVERING);

	return (ret);
}
Example #25
/*
 * __wt_lsm_meta_read --
 *	Read the metadata for an LSM tree.
 */
int
__wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_CONFIG cparser, lparser;
	WT_CONFIG_ITEM ck, cv, fileconf, lk, lv, metadata;
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	char *lsmconfig;
	u_int nchunks;

	chunk = NULL;			/* -Wconditional-uninitialized */

	WT_RET(__wt_metadata_search(session, lsm_tree->name, &lsmconfig));
	WT_ERR(__wt_config_init(session, &cparser, lsmconfig));
	while ((ret = __wt_config_next(&cparser, &ck, &cv)) == 0) {
		if (WT_STRING_MATCH("key_format", ck.str, ck.len)) {
			__wt_free(session, lsm_tree->key_format);
			WT_ERR(__wt_strndup(session,
			    cv.str, cv.len, &lsm_tree->key_format));
		} else if (WT_STRING_MATCH("value_format", ck.str, ck.len)) {
			__wt_free(session, lsm_tree->value_format);
			WT_ERR(__wt_strndup(session,
			    cv.str, cv.len, &lsm_tree->value_format));
		} else if (WT_STRING_MATCH("collator", ck.str, ck.len)) {
			if (cv.len == 0 ||
			    WT_STRING_CASE_MATCH("none", cv.str, cv.len))
				continue;
			/*
			 * Extract the application-supplied metadata (if any)
			 * from the file configuration.
			 */
			WT_ERR(__wt_config_getones(
			    session, lsmconfig, "file_config", &fileconf));
			WT_CLEAR(metadata);
			WT_ERR_NOTFOUND_OK(__wt_config_subgets(
			    session, &fileconf, "app_metadata", &metadata));
			WT_ERR(__wt_collator_config(session, lsm_tree->name,
			    &cv, &metadata,
			    &lsm_tree->collator, &lsm_tree->collator_owned));
			WT_ERR(__wt_strndup(session,
			    cv.str, cv.len, &lsm_tree->collator_name));
		} else if (WT_STRING_MATCH("bloom_config", ck.str, ck.len)) {
			__wt_free(session, lsm_tree->bloom_config);
			/* Don't include the brackets. */
			WT_ERR(__wt_strndup(session,
			    cv.str + 1, cv.len - 2, &lsm_tree->bloom_config));
		} else if (WT_STRING_MATCH("file_config", ck.str, ck.len)) {
			__wt_free(session, lsm_tree->file_config);
			/* Don't include the brackets. */
			WT_ERR(__wt_strndup(session,
			    cv.str + 1, cv.len - 2, &lsm_tree->file_config));
		} else if (WT_STRING_MATCH("auto_throttle", ck.str, ck.len)) {
			if (cv.val)
				F_SET(lsm_tree, WT_LSM_TREE_THROTTLE);
			else
				F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE);
		} else if (WT_STRING_MATCH("bloom", ck.str, ck.len))
			lsm_tree->bloom = (uint32_t)cv.val;
		else if (WT_STRING_MATCH("bloom_bit_count", ck.str, ck.len))
			lsm_tree->bloom_bit_count = (uint32_t)cv.val;
		else if (WT_STRING_MATCH("bloom_hash_count", ck.str, ck.len))
			lsm_tree->bloom_hash_count = (uint32_t)cv.val;
		else if (WT_STRING_MATCH("chunk_max", ck.str, ck.len))
			lsm_tree->chunk_max = (uint64_t)cv.val;
		else if (WT_STRING_MATCH("chunk_size", ck.str, ck.len))
			lsm_tree->chunk_size = (uint64_t)cv.val;
		else if (WT_STRING_MATCH("merge_max", ck.str, ck.len))
			lsm_tree->merge_max = (uint32_t)cv.val;
		else if (WT_STRING_MATCH("merge_min", ck.str, ck.len))
			lsm_tree->merge_min = (uint32_t)cv.val;
		else if (WT_STRING_MATCH("last", ck.str, ck.len))
			lsm_tree->last = (u_int)cv.val;
		else if (WT_STRING_MATCH("chunks", ck.str, ck.len)) {
			WT_ERR(__wt_config_subinit(session, &lparser, &cv));
			for (nchunks = 0; (ret =
			    __wt_config_next(&lparser, &lk, &lv)) == 0; ) {
				if (WT_STRING_MATCH("id", lk.str, lk.len)) {
					WT_ERR(__wt_realloc_def(session,
					    &lsm_tree->chunk_alloc,
					    nchunks + 1, &lsm_tree->chunk));
					WT_ERR(
					    __wt_calloc_one(session, &chunk));
					lsm_tree->chunk[nchunks++] = chunk;
					chunk->id = (uint32_t)lv.val;
					WT_ERR(__wt_lsm_tree_chunk_name(session,
					    lsm_tree, chunk->id, &chunk->uri));
					F_SET(chunk,
					    WT_LSM_CHUNK_ONDISK |
					    WT_LSM_CHUNK_STABLE);
				} else if (WT_STRING_MATCH(
				    "bloom", lk.str, lk.len)) {
					WT_ERR(__wt_lsm_tree_bloom_name(
					    session, lsm_tree,
					    chunk->id, &chunk->bloom_uri));
					F_SET(chunk, WT_LSM_CHUNK_BLOOM);
					continue;
				} else if (WT_STRING_MATCH(
				    "chunk_size", lk.str, lk.len)) {
					chunk->size = (uint64_t)lv.val;
					continue;
				} else if (WT_STRING_MATCH(
				    "count", lk.str, lk.len)) {
					chunk->count = (uint64_t)lv.val;
					continue;
				} else if (WT_STRING_MATCH(
				    "generation", lk.str, lk.len)) {
					chunk->generation = (uint32_t)lv.val;
					continue;
				}
			}
			WT_ERR_NOTFOUND_OK(ret);
			lsm_tree->nchunks = nchunks;
		} else if (WT_STRING_MATCH("old_chunks", ck.str, ck.len)) {
			WT_ERR(__wt_config_subinit(session, &lparser, &cv));
			for (nchunks = 0; (ret =
			    __wt_config_next(&lparser, &lk, &lv)) == 0; ) {
				if (WT_STRING_MATCH("bloom", lk.str, lk.len)) {
					WT_ERR(__wt_strndup(session,
					    lv.str, lv.len, &chunk->bloom_uri));
					F_SET(chunk, WT_LSM_CHUNK_BLOOM);
					continue;
				}
				WT_ERR(__wt_realloc_def(session,
				    &lsm_tree->old_alloc, nchunks + 1,
				    &lsm_tree->old_chunks));
				WT_ERR(__wt_calloc_one(session, &chunk));
				lsm_tree->old_chunks[nchunks++] = chunk;
				WT_ERR(__wt_strndup(session,
				    lk.str, lk.len, &chunk->uri));
				F_SET(chunk, WT_LSM_CHUNK_ONDISK);
			}
			WT_ERR_NOTFOUND_OK(ret);
			lsm_tree->nold_chunks = nchunks;
		/* Values included for backward compatibility */
		} else if (WT_STRING_MATCH("merge_threads", ck.str, ck.len)) {
		} else
			WT_ERR(__wt_illegal_value(session, "LSM metadata"));
	}
	WT_ERR_NOTFOUND_OK(ret);

	/*
	 * If the default merge_min was not overridden, calculate it now.  We
	 * do this here so that trees created before merge_min was added get a
	 * sane value.
	 */
	if (lsm_tree->merge_min < 2)
		lsm_tree->merge_min = WT_MAX(2, lsm_tree->merge_max / 2);

err:	__wt_free(session, lsmconfig);
	return (ret);
}
Example #26
/*
 * __conn_dhandle_get --
 *	Allocate a new data handle, lock it exclusively, and return it linked
 *	into the connection's list.
 */
static int
__conn_dhandle_get(WT_SESSION_IMPL *session,
    const char *name, const char *ckpt, uint32_t flags)
{
	WT_BTREE *btree;
	WT_CONNECTION_IMPL *conn;
	WT_DATA_HANDLE *dhandle;
	WT_DECL_RET;
	uint32_t bucket;

	conn = S2C(session);

	/*
	 * We have the handle lock, check whether we can find the handle we
	 * are looking for.  If we do, and we can lock it in the state we
	 * want, this session will take ownership and we are done.
	 */
	ret = __wt_conn_dhandle_find(session, name, ckpt, flags);
	if (ret == 0) {
		dhandle = session->dhandle;
		WT_RET(__conn_dhandle_open_lock(session, dhandle, flags));
		return (0);
	}
	WT_RET_NOTFOUND_OK(ret);

	/*
	 * If no handle was found, allocate the data handle and a btree handle,
	 * then initialize the data handle.  Exclusively lock the data handle
	 * before inserting it in the list.
	 */
	WT_RET(__wt_calloc_one(session, &dhandle));

	WT_ERR(__wt_rwlock_alloc(session, &dhandle->rwlock, "data handle"));

	dhandle->name_hash = __wt_hash_city64(name, strlen(name));
	WT_ERR(__wt_strdup(session, name, &dhandle->name));
	if (ckpt != NULL)
		WT_ERR(__wt_strdup(session, ckpt, &dhandle->checkpoint));

	WT_ERR(__wt_calloc_one(session, &btree));
	dhandle->handle = btree;
	btree->dhandle = dhandle;

	WT_ERR(__wt_spin_init(
	    session, &dhandle->close_lock, "data handle close"));

	F_SET(dhandle, WT_DHANDLE_EXCLUSIVE);
	WT_ERR(__wt_writelock(session, dhandle->rwlock));

	/*
	 * Prepend the handle to the connection list, assuming we're likely to
	 * need new files again soon, until they are cached by all sessions.
	 * Find the right hash bucket to insert into as well.
	 */
	WT_ASSERT(session, F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED));
	bucket = dhandle->name_hash % WT_HASH_ARRAY_SIZE;
	WT_CONN_DHANDLE_INSERT(conn, dhandle, bucket);

	session->dhandle = dhandle;
	return (0);

err:	WT_TRET(__wt_rwlock_destroy(session, &dhandle->rwlock));
	__wt_free(session, dhandle->name);
	__wt_free(session, dhandle->checkpoint);
	__wt_free(session, dhandle->handle);		/* btree free */
	__wt_spin_destroy(session, &dhandle->close_lock);
	__wt_overwrite_and_free(session, dhandle);

	return (ret);
}
Example #27
/*
 * __sweep --
 *	Close unused dhandles on the connection dhandle list.
 */
static int
__sweep(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_DATA_HANDLE *dhandle, *dhandle_next;
	WT_DECL_RET;
	time_t now;
	int locked;

	conn = S2C(session);

	/* Don't discard handles that have been open recently. */
	WT_RET(__wt_seconds(session, &now));

	WT_STAT_FAST_CONN_INCR(session, dh_conn_sweeps);
	dhandle = SLIST_FIRST(&conn->dhlh);
	for (; dhandle != NULL; dhandle = dhandle_next) {
		dhandle_next = SLIST_NEXT(dhandle, l);
		if (WT_IS_METADATA(dhandle))
			continue;
		if (dhandle->session_inuse != 0 ||
		    now <= dhandle->timeofdeath + WT_DHANDLE_SWEEP_WAIT)
			continue;
		if (dhandle->timeofdeath == 0) {
			dhandle->timeofdeath = now;
			WT_STAT_FAST_CONN_INCR(session, dh_conn_tod);
			continue;
		}

		/*
		 * We have a candidate for closing; if it's open, acquire an
		 * exclusive lock on the handle and close it. We might be
		 * blocking opens for a long time (over disk I/O), but the
		 * handle was quiescent for a while.
		 *
		 * The close can fail if an update cannot be written (updates
		 * in a no-longer-referenced file might not yet be globally
		 * visible if sessions have disjoint sets of files open).  If
		 * the handle is busy, skip it, we'll retry the close the next
		 * time, after the transaction state has progressed.
		 *
		 * We don't set WT_DHANDLE_EXCLUSIVE deliberately, we want
		 * opens to block on us rather than returning an EBUSY error to
		 * the application.
		 */
		if ((ret =
		    __wt_try_writelock(session, dhandle->rwlock)) == EBUSY)
			continue;
		WT_RET(ret);
		locked = 1;

		/* If the handle is open, try to close it. */
		if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) {
			WT_WITH_DHANDLE(session, dhandle,
			    ret = __wt_conn_btree_sync_and_close(session, 0));
			if (ret != 0)
				goto unlock;

			/* We closed the btree handle, bump the statistic. */
			WT_STAT_FAST_CONN_INCR(session, dh_conn_handles);
		}

		/*
		 * If there are no longer any references to the handle in any
		 * sessions, attempt to discard it.  The called function
		 * re-checks that the handle is not in use, which is why we
		 * don't do any special handling of EBUSY returns above.
		 */
		if (dhandle->session_inuse == 0 && dhandle->session_ref == 0) {
			WT_WITH_DHANDLE(session, dhandle,
			    ret = __wt_conn_dhandle_discard_single(session, 0));
			if (ret != 0)
				goto unlock;

			/* If the handle was discarded, it isn't locked. */
			locked = 0;
		} else
			WT_STAT_FAST_CONN_INCR(session, dh_conn_ref);

unlock:		if (locked)
			WT_TRET(__wt_writeunlock(session, dhandle->rwlock));

		WT_RET_BUSY_OK(ret);
	}
	return (0);
}
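
The sweep above is deliberately two-phase: the pass that first finds a handle idle only stamps a time of death, and the handle is actually closed on a later pass once it has stayed idle past the wait period. A standalone sketch of that policy is below; the handle structure, the reset-on-reuse behavior and the 30-second wait are illustrative assumptions, not the WiredTiger data handle or WT_DHANDLE_SWEEP_WAIT.

#include <stdbool.h>
#include <time.h>

#define	SWEEP_WAIT	30	/* Seconds a handle must stay idle before closing. */

struct handle {
	int in_use;		/* Non-zero while sessions hold the handle. */
	time_t time_of_death;	/* When the handle was first seen idle. */
};

/*
 * handle_sweep_check --
 *	Return true if an idle handle has been idle long enough to close.
 *	The first idle observation only records the time of death.
 */
bool
handle_sweep_check(struct handle *h, time_t now)
{
	if (h->in_use != 0) {
		h->time_of_death = 0;	/* Busy again: restart the clock. */
		return (false);
	}
	if (h->time_of_death == 0) {
		h->time_of_death = now;	/* First idle sighting. */
		return (false);
	}
	return (now - h->time_of_death > SWEEP_WAIT);
}
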
Example #28
/*
 * __conn_btree_open --
 *	Open the current btree handle.
 */
static int
__conn_btree_open(
	WT_SESSION_IMPL *session, const char *cfg[], uint32_t flags)
{
	WT_BTREE *btree;
	WT_DATA_HANDLE *dhandle;
	WT_DECL_RET;

	dhandle = session->dhandle;
	btree = S2BT(session);

	WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED) &&
	    F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) &&
	    !LF_ISSET(WT_DHANDLE_LOCK_ONLY));

	/*
	 * If the handle is already open, it has to be closed so it can be
	 * reopened with a new configuration.  We don't need to check again:
	 * this function isn't called if the handle is already open in the
	 * required mode.
	 *
	 * This call can return EBUSY if there's an update in the object that's
	 * not yet globally visible.  That's not a problem because it can only
	 * happen when we're switching from a normal handle to a "special" one,
	 * so we're returning EBUSY to an attempt to verify or do other special
	 * operations.  The reverse won't happen because when the handle from a
	 * verify or other special operation is closed, there won't be updates
	 * in the tree that can block the close.
	 */
	if (F_ISSET(dhandle, WT_DHANDLE_OPEN))
		WT_RET(__wt_conn_btree_sync_and_close(session, 0));

	/* Discard any previous configuration, set up the new configuration. */
	__conn_btree_config_clear(session);
	WT_RET(__conn_btree_config_set(session));

	/* Set any special flags on the handle. */
	F_SET(btree, LF_ISSET(WT_BTREE_SPECIAL_FLAGS));

	do {
		WT_ERR(__wt_btree_open(session, cfg));
		F_SET(dhandle, WT_DHANDLE_OPEN);
		/*
		 * Checkpoint handles are read-only, so it is better for
		 * eviction calculations based on the number of btrees to
		 * ignore them.
		 */
		if (dhandle->checkpoint == NULL)
			++S2C(session)->open_btree_count;

		/* Drop back to a readlock if that is all that was needed. */
		if (!LF_ISSET(WT_DHANDLE_EXCLUSIVE)) {
			F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
			WT_ERR(__wt_writeunlock(session, dhandle->rwlock));
			WT_ERR(
			    __conn_dhandle_open_lock(session, dhandle, flags));
		}
	} while (!F_ISSET(dhandle, WT_DHANDLE_OPEN));

	if (0) {
err:		F_CLR(btree, WT_BTREE_SPECIAL_FLAGS);
		/* If the open failed, close the handle. */
		if (F_ISSET(dhandle, WT_DHANDLE_OPEN))
			WT_TRET(__wt_conn_btree_sync_and_close(session, 0));
	}

	return (ret);
}
Example #29
/*
 * __txn_op_log --
 *	Log an operation for the current transaction.
 */
static int
__txn_op_log(WT_SESSION_IMPL *session,
    WT_ITEM *logrec, WT_TXN_OP *op, WT_CURSOR_BTREE *cbt)
{
	WT_CURSOR *cursor;
	WT_ITEM value;
	WT_UPDATE *upd;
	uint64_t recno;

	cursor = &cbt->iface;

	upd = op->u.upd;
	value.data = upd->data;
	value.size = upd->size;

	/*
	 * Log the row- or column-store insert, modify, remove or update. Our
	 * caller doesn't log reserve operations, we shouldn't see them here.
	 */
	if (cbt->btree->type == BTREE_ROW) {
#ifdef HAVE_DIAGNOSTIC
		__txn_op_log_row_key_check(session, cbt);
#endif
		switch (upd->type) {
		case WT_UPDATE_DELETED:
			WT_RET(__wt_logop_row_remove_pack(
			    session, logrec, op->fileid, &cursor->key));
			break;
		case WT_UPDATE_MODIFIED:
			WT_RET(__wt_logop_row_modify_pack(
			    session, logrec, op->fileid, &cursor->key, &value));
			break;
		case WT_UPDATE_STANDARD:
			WT_RET(__wt_logop_row_put_pack(
			    session, logrec, op->fileid, &cursor->key, &value));
			break;
		WT_ILLEGAL_VALUE(session);
		}
	} else {
		recno = WT_INSERT_RECNO(cbt->ins);
		WT_ASSERT(session, recno != WT_RECNO_OOB);

		switch (upd->type) {
		case WT_UPDATE_DELETED:
			WT_RET(__wt_logop_col_remove_pack(
			    session, logrec, op->fileid, recno));
			break;
		case WT_UPDATE_MODIFIED:
			WT_RET(__wt_logop_col_modify_pack(
			    session, logrec, op->fileid, recno, &value));
			break;
		case WT_UPDATE_STANDARD:
			WT_RET(__wt_logop_col_put_pack(
			    session, logrec, op->fileid, recno, &value));
			break;
		WT_ILLEGAL_VALUE(session);
		}
	}

	return (0);
}
Example #30
/*
 * __wt_page_inmem --
 *	Build in-memory page information.
 */
int
__wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref,
    const void *image, size_t memsize, uint32_t flags, WT_PAGE **pagep)
{
	WT_DECL_RET;
	WT_PAGE *page;
	const WT_PAGE_HEADER *dsk;
	uint32_t alloc_entries;
	size_t size;

	*pagep = NULL;

	dsk = image;
	alloc_entries = 0;

	/*
	 * Figure out how many underlying objects the page references so we can
	 * allocate them along with the page.
	 */
	switch (dsk->type) {
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
		/*
		 * Column-store leaf page entries map one-to-one to the number
		 * of physical entries on the page (each physical entry is a
		 * value item).
		 *
		 * Column-store internal page entries map one-to-one to the
		 * number of physical entries on the page (each entry is a
		 * location cookie).
		 */
		alloc_entries = dsk->u.entries;
		break;
	case WT_PAGE_ROW_INT:
		/*
		 * Row-store internal page entries map one-to-two to the number
		 * of physical entries on the page (each entry is a key and
		 * location cookie pair).
		 */
		alloc_entries = dsk->u.entries / 2;
		break;
	case WT_PAGE_ROW_LEAF:
		/*
		 * If the "no empty values" flag is set, row-store leaf page
		 * entries map one-to-one to the number of physical entries
		 * on the page (each physical entry is a key or value item).
		 * If that flag is not set, there are more keys than values,
		 * we have to walk the page to figure it out.
		 */
		if (F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL))
			alloc_entries = dsk->u.entries;
		else if (F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE))
			alloc_entries = dsk->u.entries / 2;
		else
			WT_RET(__inmem_row_leaf_entries(
			    session, dsk, &alloc_entries));
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* Allocate and initialize a new WT_PAGE. */
	WT_RET(__wt_page_alloc(
	    session, dsk->type, dsk->recno, alloc_entries, 1, &page));
	page->dsk = dsk;
	F_SET_ATOMIC(page, flags);

	/*
	 * Track the memory allocated to build this page so we can update the
	 * cache statistics in a single call. If the disk image is in allocated
	 * memory, start with that.
	 */
	size = LF_ISSET(WT_PAGE_DISK_ALLOC) ? memsize : 0;

	switch (page->type) {
	case WT_PAGE_COL_FIX:
		__inmem_col_fix(session, page);
		break;
	case WT_PAGE_COL_INT:
		__inmem_col_int(session, page);
		break;
	case WT_PAGE_COL_VAR:
		WT_ERR(__inmem_col_var(session, page, &size));
		break;
	case WT_PAGE_ROW_INT:
		WT_ERR(__inmem_row_int(session, page, &size));
		break;
	case WT_PAGE_ROW_LEAF:
		WT_ERR(__inmem_row_leaf(session, page));
		break;
	WT_ILLEGAL_VALUE_ERR(session);
	}

	/* Update the page's in-memory size and the cache statistics. */
	__wt_cache_page_inmem_incr(session, page, size);

	/* Link the new internal page to the parent. */
	if (ref != NULL) {
		switch (page->type) {
		case WT_PAGE_COL_INT:
		case WT_PAGE_ROW_INT:
			page->pg_intl_parent_ref = ref;
			break;
		}
		ref->page = page;
	}

	*pagep = page;
	return (0);

err:	__wt_page_out(session, &page);
	return (ret);
}