Example #1
0
/*
 * __wt_bt_read --
 *	Read a cookie referenced block into a buffer.
 */
int
__wt_bt_read(WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, uint32_t addr_size)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_PAGE_HEADER *dsk;
	size_t result_len;

	btree = session->btree;
	bm = btree->bm;

	/*
	 * If anticipating a compressed block, read into a scratch buffer and
	 * decompress into the caller's buffer.  Else, read directly into the
	 * caller's buffer.
	 */
	if (btree->compressor == NULL) {
		WT_RET(bm->read(bm, session, buf, addr, addr_size));
		dsk = buf->mem;
	} else {
		WT_RET(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->read(bm, session, tmp, addr, addr_size));
		dsk = tmp->mem;
	}

	/*
	 * If the block is compressed, copy the skipped bytes of the original
	 * image into place, then decompress.
	 */
	if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) {
		if (btree->compressor == NULL ||
		    btree->compressor->decompress == NULL)
			WT_ERR_MSG(session, WT_ERROR,
			    "read compressed block where no compression engine "
			    "configured");

		/*
		 * We're allocating the exact number of bytes we're expecting
		 * from decompression.
		 */
		WT_ERR(__wt_buf_init(session, buf, dsk->mem_size));
		buf->size = dsk->mem_size;

		/*
		 * Note the source length is NOT the number of compressed bytes,
		 * it's the length of the block we just read (minus the skipped
		 * bytes).  We don't store the number of compressed bytes: some
		 * compression engines need that length stored externally, they
		 * don't have markers in the stream to signal the end of the
		 * compressed bytes.  Those engines must store the compressed
		 * byte length somehow, see the snappy compression extension for
		 * an example.
		 */
		memcpy(buf->mem, tmp->mem, WT_BLOCK_COMPRESS_SKIP);
		WT_ERR(btree->compressor->decompress(
		    btree->compressor, &session->iface,
		    (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP,
		    tmp->size - WT_BLOCK_COMPRESS_SKIP,
		    (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP,
		    dsk->mem_size - WT_BLOCK_COMPRESS_SKIP,
		    &result_len));

		/*
		 * If checksums were turned off because we're depending on the
		 * decompression to fail on any corrupted data, we'll end up
		 * here after corruption happens.  If we're salvaging the file,
		 * it's OK, otherwise it's really, really bad.
		 */
		if (result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP)
			WT_ERR(
			    F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR) ?
			    WT_ERROR :
			    __wt_illegal_value(session, btree->name));
	} else
		if (btree->compressor == NULL)
			buf->size = dsk->mem_size;
		else
			/*
			 * We guessed wrong: there was a compressor, but this
			 * block was not compressed, and now the page is in the
			 * wrong buffer and the buffer may be of the wrong size.
			 * This should be rare, but happens with small blocks
			 * that aren't worth compressing.
			 */
			WT_ERR(__wt_buf_set(
			    session, buf, tmp->data, dsk->mem_size));

	/* If the handle is a verify handle, verify the physical page. */
	if (F_ISSET(btree, WT_BTREE_VERIFY)) {
		if (tmp == NULL)
			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size));
		WT_ERR(__wt_verify_dsk(session, (const char *)tmp->data, buf));
	}

	WT_CSTAT_INCR(session, cache_read);
	WT_DSTAT_INCR(session, cache_read);
	if (F_ISSET(dsk, WT_PAGE_COMPRESSED))
		WT_DSTAT_INCR(session, compress_read);
	WT_CSTAT_INCRV(session, cache_bytes_read, addr_size);
	WT_DSTAT_INCRV(session, cache_bytes_read, addr_size);

err:	__wt_scr_free(&tmp);
	return (ret);
}
Example #2
0
/*
 * __conn_config_file --
 *	Read in any WiredTiger_config file in the home directory.
 */
static int
__conn_config_file(WT_SESSION_IMPL *session, const char **cfg, WT_ITEM **cbufp)
{
	WT_DECL_ITEM(cbuf);
	WT_DECL_RET;
	WT_FH *fh;
	off_t size;
	uint32_t len;
	int exist, quoted;
	uint8_t *p, *t;

	*cbufp = NULL;				/* Returned buffer */

	fh = NULL;

	/* Check for an optional configuration file. */
#define	WT_CONFIGFILE	"WiredTiger.config"
	WT_RET(__wt_exist(session, WT_CONFIGFILE, &exist));
	if (!exist)
		return (0);

	/* Open the configuration file. */
	WT_RET(__wt_open(session, WT_CONFIGFILE, 0, 0, 0, &fh));
	WT_ERR(__wt_filesize(session, fh, &size));
	if (size == 0)
		goto err;

	/*
	 * Sanity test: a 100KB configuration file would be insane.  (There's
	 * no practical reason to limit the file size, but I can either limit
	 * the file size to something rational, or I can add code to test if
	 * the off_t size is larger than a uint32_t, which is more complicated
	 * and a waste of time.)
	 */
	if (size > 100 * 1024)
		WT_ERR_MSG(session, EFBIG, WT_CONFIGFILE);
	len = (uint32_t)size;

	/*
	 * Copy the configuration file into memory, with a little slop, I'm not
	 * interested in debugging off-by-ones.
	 *
	 * The beginning of a file is the same as if we run into an unquoted
	 * newline character, simplify the parsing loop by pretending that's
	 * what we're doing.
	 */
	WT_ERR(__wt_scr_alloc(session, len + 10,  &cbuf));
	WT_ERR(
	    __wt_read(session, fh, (off_t)0, len, ((uint8_t *)cbuf->mem) + 1));
	((uint8_t *)cbuf->mem)[0] = '\n';
	cbuf->size = len + 1;

	/*
	 * Collapse the file's lines into a single string: newline characters
	 * are replaced with commas unless the newline is quoted or backslash
	 * escaped.  Comment lines (an unescaped newline where the next non-
	 * white-space character is a hash), are discarded.
	 */
	for (quoted = 0, p = t = cbuf->mem; len > 0;) {
		/*
		 * Backslash pairs pass through untouched, unless immediately
		 * preceding a newline, in which case both the backslash and
		 * the newline are discarded.  Backslash characters escape
		 * quoted characters, too, that is, a backslash followed by a
		 * quote doesn't start or end a quoted string.
		 */
		if (*p == '\\' && len > 1) {
			if (p[1] != '\n') {
				*t++ = p[0];
				*t++ = p[1];
			}
			p += 2;
			len -= 2;
			continue;
		}

		/*
		 * If we're in a quoted string, or starting a quoted string,
		 * take all characters, including white-space and newlines.
		 */
		if (quoted || *p == '"') {
			if (*p == '"')
				quoted = !quoted;
			*t++ = *p++;
			--len;
			continue;
		}

		/* Everything else gets taken, except for newline characters. */
		if (*p != '\n') {
			*t++ = *p++;
			--len;
			continue;
		}

		/*
		 * Replace any newline characters with commas (and strings of
		 * commas are safe).
		 *
		 * After any newline, skip to a non-white-space character; if
		 * the next character is a hash mark, skip to the next newline.
		 */
		for (;;) {
			for (*t++ = ','; --len > 0 && isspace(*++p);)
				;
			if (len == 0)
				break;
			if (*p != '#')
				break;
			while (--len > 0 && *++p != '\n')
				;
			if (len == 0)
				break;
		}
	}
	*t = '\0';

#if 0
	fprintf(stderr, "file config: {%s}\n", (const char *)cbuf->data);
#endif

	/* Check the configuration string. */
	WT_ERR(__wt_config_check(
	    session, __wt_confchk_wiredtiger_open, cbuf->data, 0));

	/*
	 * The configuration file falls between the default configuration and
	 * the wiredtiger_open() configuration, overriding the defaults but not
	 * overriding the wiredtiger_open() configuration.
	 */
	while (cfg[1] != NULL)
		++cfg;
	cfg[1] = cfg[0];
	cfg[0] = cbuf->data;

	*cbufp = cbuf;

	if (0) {
err:		if (cbuf != NULL)
			__wt_buf_free(session, cbuf);
	}
	if (fh != NULL)
		WT_TRET(__wt_close(session, fh));
	return (ret);
}
Example #3
0
/*
 * __las_page_instantiate --
 *	Instantiate lookaside update records in a recently read page.
 */
static int
__las_page_instantiate(WT_SESSION_IMPL *session,
    WT_REF *ref, uint32_t read_id, const uint8_t *addr, size_t addr_size)
{
	WT_CURSOR *cursor;
	WT_CURSOR_BTREE cbt;
	WT_DECL_ITEM(current_key);
	WT_DECL_ITEM(las_addr);
	WT_DECL_ITEM(las_key);
	WT_DECL_ITEM(las_value);
	WT_DECL_RET;
	WT_PAGE *page;
	WT_UPDATE *first_upd, *last_upd, *upd;
	size_t incr, total_incr;
	uint64_t current_recno, las_counter, las_txnid, recno, upd_txnid;
	uint32_t las_id, upd_size, session_flags;
	int exact;
	const uint8_t *p;

	cursor = NULL;
	page = ref->page;
	first_upd = last_upd = upd = NULL;
	total_incr = 0;
	current_recno = recno = WT_RECNO_OOB;
	session_flags = 0;		/* [-Werror=maybe-uninitialized] */

	__wt_btcur_init(session, &cbt);
	__wt_btcur_open(&cbt);

	WT_ERR(__wt_scr_alloc(session, 0, &current_key));
	WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
	WT_ERR(__wt_scr_alloc(session, 0, &las_key));
	WT_ERR(__wt_scr_alloc(session, 0, &las_value));

	/* Open a lookaside table cursor. */
	WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));

	/*
	 * The lookaside records are in key and update order, that is, there
	 * will be a set of in-order updates for a key, then another set of
	 * in-order updates for a subsequent key. We process all of the updates
	 * for a key and then insert those updates into the page, then all the
	 * updates for the next key, and so on.
	 *
	 * Search for the block's unique prefix, stepping through any matching
	 * records.
	 */
	las_addr->data = addr;
	las_addr->size = addr_size;
	las_key->size = 0;
	cursor->set_key(
	    cursor, read_id, las_addr, (uint64_t)0, (uint32_t)0, las_key);
	if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0)
		ret = cursor->next(cursor);
	for (; ret == 0; ret = cursor->next(cursor)) {
		WT_ERR(cursor->get_key(cursor,
		    &las_id, las_addr, &las_counter, &las_txnid, las_key));

		/*
		 * Confirm the search using the unique prefix; if not a match,
		 * we're done searching for records for this page.
		 */
		if (las_id != read_id ||
		    las_addr->size != addr_size ||
		    memcmp(las_addr->data, addr, addr_size) != 0)
			break;

		/*
		 * If the on-page value has become globally visible, this record
		 * is no longer needed.
		 */
		if (__wt_txn_visible_all(session, las_txnid))
			continue;

		/* Allocate the WT_UPDATE structure. */
		WT_ERR(cursor->get_value(
		    cursor, &upd_txnid, &upd_size, las_value));
		WT_ERR(__wt_update_alloc(session,
		    (upd_size == WT_UPDATE_DELETED_VALUE) ? NULL : las_value,
		    &upd, &incr));
		total_incr += incr;
		upd->txnid = upd_txnid;

		switch (page->type) {
		case WT_PAGE_COL_FIX:
		case WT_PAGE_COL_VAR:
			p = las_key->data;
			WT_ERR(__wt_vunpack_uint(&p, 0, &recno));
			if (current_recno == recno)
				break;
			WT_ASSERT(session, current_recno < recno);

			if (first_upd != NULL) {
				WT_ERR(__col_instantiate(session,
				    current_recno, ref, &cbt, first_upd));
				first_upd = NULL;
			}
			current_recno = recno;
			break;
		case WT_PAGE_ROW_LEAF:
			if (current_key->size == las_key->size &&
			    memcmp(current_key->data,
			    las_key->data, las_key->size) == 0)
				break;

			if (first_upd != NULL) {
				WT_ERR(__row_instantiate(session,
				    current_key, ref, &cbt, first_upd));
				first_upd = NULL;
			}
			WT_ERR(__wt_buf_set(session,
			    current_key, las_key->data, las_key->size));
			break;
		WT_ILLEGAL_VALUE_ERR(session);
		}

		/* Append the latest update to the list. */
		if (first_upd == NULL)
			first_upd = last_upd = upd;
		else {
			last_upd->next = upd;
			last_upd = upd;
		}
		upd = NULL;
	}
	WT_ERR_NOTFOUND_OK(ret);

	/* Insert the last set of updates, if any. */
	if (first_upd != NULL)
		switch (page->type) {
		case WT_PAGE_COL_FIX:
		case WT_PAGE_COL_VAR:
			WT_ERR(__col_instantiate(session,
			    current_recno, ref, &cbt, first_upd));
			first_upd = NULL;
			break;
		case WT_PAGE_ROW_LEAF:
			WT_ERR(__row_instantiate(session,
			    current_key, ref, &cbt, first_upd));
			first_upd = NULL;
			break;
		WT_ILLEGAL_VALUE_ERR(session);
		}

	/* Discard the cursor. */
	WT_ERR(__wt_las_cursor_close(session, &cursor, session_flags));

	if (total_incr != 0) {
		__wt_cache_page_inmem_incr(session, page, total_incr);

		/*
		 * We've modified/dirtied the page, but that's not necessary and
		 * if we keep the page clean, it's easier to evict. We leave the
		 * lookaside table updates in place, so if we evict this page
		 * without dirtying it, any future instantiation of it will find
		 * the records it needs. If the page is dirtied before eviction,
		 * then we'll write any needed lookaside table records for the
		 * new location of the page.
		 */
		__wt_page_modify_clear(session, page);
	}

err:	WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
	WT_TRET(__wt_btcur_close(&cbt, 1));

	/*
	 * On error, upd points to a single unlinked WT_UPDATE structure,
	 * first_upd points to a list.
	 */
	if (upd != NULL)
		__wt_free(session, upd);
	if (first_upd != NULL)
		__wt_free_update_list(session, first_upd);

	__wt_scr_free(session, &current_key);
	__wt_scr_free(session, &las_addr);
	__wt_scr_free(session, &las_key);
	__wt_scr_free(session, &las_value);

	return (ret);
}
Example #4
0
/*重命名一个索引或者colgroup*/
static int __rename_tree(WT_SESSION_IMPL* session, WT_TABLE* table, const char *newuri, const char *name, const char *cfg[])
{
	WT_CONFIG_ITEM cval;
	WT_DECL_ITEM(nn);
	WT_DECL_ITEM(ns);
	WT_DECL_ITEM(nv);
	WT_DECL_ITEM(os);
	WT_DECL_RET;
	const char *newname, *olduri, *suffix;
	char *value;
	int is_colgroup;

	olduri = table->name;
	value = NULL;

	newname = newuri;
	(void)WT_PREFIX_SKIP(newname, "table:");

	/*判断是重命名一个colgroup*/
	is_colgroup = WT_PREFIX_MATCH(name, "colgroup:");
	if (!is_colgroup && !WT_PREFIX_MATCH(name, "index:"))
		WT_ERR_MSG(session, EINVAL, "expected a 'colgroup:' or 'index:' source: '%s'", name);

	suffix = strchr(name, ':');
	/* An existing table should have a well formed name. */
	WT_ASSERT(session, suffix != NULL);
	suffix = strchr(suffix + 1, ':');

	WT_ERR(__wt_scr_alloc(session, 0, &nn));
	WT_ERR(__wt_buf_fmt(session, nn, "%s%s%s", is_colgroup ? "colgroup:" : "index:", newname, (suffix == NULL) ? "" : suffix));

	/* Skip the colon, if any. */
	if (suffix != NULL)
		++suffix;

	WT_ERR(__wt_metadata_search(session, name, &value));

	/*
	 * Calculate the new data source URI.  Use the existing table structure
	 * and substitute the new name temporarily.
	 */
	WT_ERR(__wt_scr_alloc(session, 0, &ns));
	table->name = newuri;
	if (is_colgroup)
		WT_ERR(__wt_schema_colgroup_source(session, table, suffix, value, ns));
	else
		WT_ERR(__wt_schema_index_source(session, table, suffix, value, ns));

	if ((ret = __wt_config_getones(session, value, "source", &cval)) != 0)
		WT_ERR_MSG(session, EINVAL, "index or column group has no data source: %s", value);

	/* Take a copy of the old data source. */
	WT_ERR(__wt_scr_alloc(session, 0, &os));
	WT_ERR(__wt_buf_fmt(session, os, "%.*s", (int)cval.len, cval.str));

	/* Overwrite it with the new data source. */
	WT_ERR(__wt_scr_alloc(session, 0, &nv));
	WT_ERR(__wt_buf_fmt(session, nv, "%.*s%s%s", (int)WT_PTRDIFF(cval.str, value), value, (const char *)ns->data, cval.str + cval.len));

	/*
	 * Remove the old metadata entry.
	 * Insert the new metadata entry.
	 */
	WT_ERR(__wt_metadata_remove(session, name));
	WT_ERR(__wt_metadata_insert(session, nn->data, nv->data));

	/* Rename the file. */
	WT_ERR(__wt_schema_rename(session, os->data, ns->data, cfg));

err:
	__wt_scr_free(session, &nn);
	__wt_scr_free(session, &ns);
	__wt_scr_free(session, &nv);
	__wt_scr_free(session, &os);
	__wt_free(session, value);
	table->name = olduri;

	return ret;
}
Example #5
0
File: bt_io.c Project: qihsh/mongo
/*
 * __wt_bt_write --
 *	Write a buffer into a block, returning the block's addr/size and
 * checksum.
 */
int
__wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf,
    uint8_t *addr, size_t *addr_sizep, int checkpoint, int compressed)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_ITEM(ctmp);
	WT_DECL_ITEM(etmp);
	WT_DECL_RET;
	WT_KEYED_ENCRYPTOR *kencryptor;
	WT_ITEM *ip;
	WT_PAGE_HEADER *dsk;
	size_t dst_len, len, result_len, size, src_len;
	int compression_failed, data_cksum, encrypted;
	uint8_t *dst, *src;

	btree = S2BT(session);
	bm = btree->bm;
	encrypted = 0;

	/* Checkpoint calls are different than standard calls. */
	WT_ASSERT(session,
	    (checkpoint == 0 && addr != NULL && addr_sizep != NULL) ||
	    (checkpoint == 1 && addr == NULL && addr_sizep == NULL));

#ifdef HAVE_DIAGNOSTIC
	/*
	 * We're passed a table's disk image.  Decompress if necessary and
	 * verify the image.  Always check the in-memory length for accuracy.
	 */
	dsk = buf->mem;
	if (compressed) {
		WT_ERR(__wt_scr_alloc(session, dsk->mem_size, &ctmp));

		memcpy(ctmp->mem, buf->data, WT_BLOCK_COMPRESS_SKIP);
		WT_ERR(btree->compressor->decompress(
		    btree->compressor, &session->iface,
		    (uint8_t *)buf->data + WT_BLOCK_COMPRESS_SKIP,
		    buf->size - WT_BLOCK_COMPRESS_SKIP,
		    (uint8_t *)ctmp->data + WT_BLOCK_COMPRESS_SKIP,
		    ctmp->memsize - WT_BLOCK_COMPRESS_SKIP,
		    &result_len));
		WT_ASSERT(session,
		    dsk->mem_size == result_len + WT_BLOCK_COMPRESS_SKIP);
		ctmp->size = (uint32_t)result_len + WT_BLOCK_COMPRESS_SKIP;
		ip = ctmp;
	} else {
		WT_ASSERT(session, dsk->mem_size == buf->size);
		ip = buf;
	}
	WT_ERR(__wt_verify_dsk(session, "[write-check]", ip));
	__wt_scr_free(session, &ctmp);
#endif

	/*
	 * Optionally stream-compress the data, but don't compress blocks that
	 * are already as small as they're going to get.
	 */
	if (btree->compressor == NULL ||
	    btree->compressor->compress == NULL || compressed)
		ip = buf;
	else if (buf->size <= btree->allocsize) {
		ip = buf;
		WT_STAT_FAST_DATA_INCR(session, compress_write_too_small);
	} else {
		/* Skip the header bytes of the source data. */
		src = (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP;
		src_len = buf->size - WT_BLOCK_COMPRESS_SKIP;

		/*
		 * Compute the size needed for the destination buffer.  We only
		 * allocate enough memory for a copy of the original by default,
		 * if any compressed version is bigger than the original, we
		 * won't use it.  However, some compression engines (snappy is
		 * one example), may need more memory because they don't stop
		 * just because there's no more memory into which to compress.
		 */
		if (btree->compressor->pre_size == NULL)
			len = src_len;
		else
			WT_ERR(btree->compressor->pre_size(btree->compressor,
			    &session->iface, src, src_len, &len));

		size = len + WT_BLOCK_COMPRESS_SKIP;
		WT_ERR(bm->write_size(bm, session, &size));
		WT_ERR(__wt_scr_alloc(session, size, &ctmp));

		/* Skip the header bytes of the destination data. */
		dst = (uint8_t *)ctmp->mem + WT_BLOCK_COMPRESS_SKIP;
		dst_len = len;

		compression_failed = 0;
		WT_ERR(btree->compressor->compress(btree->compressor,
		    &session->iface,
		    src, src_len,
		    dst, dst_len,
		    &result_len, &compression_failed));
		result_len += WT_BLOCK_COMPRESS_SKIP;

		/*
		 * If compression fails, or doesn't gain us at least one unit of
		 * allocation, fallback to the original version.  This isn't
		 * unexpected: if compression doesn't work for some chunk of
		 * data for some reason (noting likely additional format/header
		 * information which compressed output requires), it just means
		 * the uncompressed version is as good as it gets, and that's
		 * what we use.
		 */
		if (compression_failed ||
		    buf->size / btree->allocsize <=
		    result_len / btree->allocsize) {
			ip = buf;
			WT_STAT_FAST_DATA_INCR(session, compress_write_fail);
		} else {
			compressed = 1;
			WT_STAT_FAST_DATA_INCR(session, compress_write);

			/*
			 * Copy in the skipped header bytes, set the final data
			 * size.
			 */
			memcpy(ctmp->mem, buf->mem, WT_BLOCK_COMPRESS_SKIP);
			ctmp->size = result_len;
			ip = ctmp;
		}
	}
	/*
	 * Optionally encrypt the data.  We need to add in the original
	 * length, in case both compression and encryption are done.
	 */
	if ((kencryptor = btree->kencryptor) != NULL) {
		/*
		 * Get size needed for encrypted buffer.
		 */
		__wt_encrypt_size(session, kencryptor, ip->size, &size);

		WT_ERR(bm->write_size(bm, session, &size));
		WT_ERR(__wt_scr_alloc(session, size, &etmp));
		WT_ERR(__wt_encrypt(session,
		    kencryptor, WT_BLOCK_ENCRYPT_SKIP, ip, etmp));

		encrypted = 1;
		ip = etmp;
	}
	dsk = ip->mem;

	/* If the buffer is compressed, set the flag. */
	if (compressed)
		F_SET(dsk, WT_PAGE_COMPRESSED);
	if (encrypted)
		F_SET(dsk, WT_PAGE_ENCRYPTED);

	/*
	 * We increment the block's write generation so it's easy to identify
	 * newer versions of blocks during salvage.  (It's common in WiredTiger,
	 * at least for the default block manager, for multiple blocks to be
	 * internally consistent with identical first and last keys, so we need
	 * a way to know the most recent state of the block.  We could check
	 * which leaf is referenced by a valid internal page, but that implies
	 * salvaging internal pages, which I don't want to do, and it's not
	 * as good anyway, because the internal page may not have been written
	 * after the leaf page was updated.  So, write generations it is.
	 *
	 * Nothing is locked at this point but two versions of a page with the
	 * same generation is pretty unlikely, and if we did, they're going to
	 * be roughly identical for the purposes of salvage, anyway.
	 */
	dsk->write_gen = ++btree->write_gen;

	/*
	 * Checksum the data if the buffer isn't compressed or checksums are
	 * configured.
	 */
	switch (btree->checksum) {
	case CKSUM_ON:
		data_cksum = 1;
		break;
	case CKSUM_OFF:
		data_cksum = 0;
		break;
	case CKSUM_UNCOMPRESSED:
	default:
		data_cksum = !compressed;
		break;
	}

	/* Call the block manager to write the block. */
	WT_ERR(checkpoint ?
	    bm->checkpoint(bm, session, ip, btree->ckpt, data_cksum) :
	    bm->write(bm, session, ip, addr, addr_sizep, data_cksum));

	WT_STAT_FAST_CONN_INCR(session, cache_write);
	WT_STAT_FAST_DATA_INCR(session, cache_write);
	WT_STAT_FAST_CONN_INCRV(session, cache_bytes_write, dsk->mem_size);
	WT_STAT_FAST_DATA_INCRV(session, cache_bytes_write, dsk->mem_size);

err:	__wt_scr_free(session, &ctmp);
	__wt_scr_free(session, &etmp);
	return (ret);
}
Example #6
0
/*
 * __wt_verify --
 *	Verify a file.
 */
int
__wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_CKPT *ckptbase, *ckpt;
	WT_DECL_RET;
	WT_VSTUFF *vs, _vstuff;
	size_t root_addr_size;
	uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE];
	bool bm_start, quit;

	btree = S2BT(session);
	bm = btree->bm;
	ckptbase = NULL;
	bm_start = false;

	WT_CLEAR(_vstuff);
	vs = &_vstuff;
	WT_ERR(__wt_scr_alloc(session, 0, &vs->max_key));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->max_addr));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp1));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp2));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp3));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp4));

	/* Check configuration strings. */
	WT_ERR(__verify_config(session, cfg, vs));

	/* Optionally dump specific block offsets. */
	WT_ERR(__verify_config_offsets(session, cfg, &quit));
	if (quit)
		goto done;

	/* Get a list of the checkpoints for this file. */
	WT_ERR(
	    __wt_meta_ckptlist_get(session, btree->dhandle->name, &ckptbase));

	/* Inform the underlying block manager we're verifying. */
	WT_ERR(bm->verify_start(bm, session, ckptbase, cfg));
	bm_start = true;

	/* Loop through the file's checkpoints, verifying each one. */
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		WT_ERR(__wt_verbose(session, WT_VERB_VERIFY,
		    "%s: checkpoint %s", btree->dhandle->name, ckpt->name));

		/* Fake checkpoints require no work. */
		if (F_ISSET(ckpt, WT_CKPT_FAKE))
			continue;

		/* House-keeping between checkpoints. */
		__verify_checkpoint_reset(vs);

		if (WT_VRFY_DUMP(vs))
			WT_ERR(__wt_msg(session, "%s: checkpoint %s",
			    btree->dhandle->name, ckpt->name));

		/* Load the checkpoint. */
		WT_ERR(bm->checkpoint_load(bm, session,
		    ckpt->raw.data, ckpt->raw.size,
		    root_addr, &root_addr_size, true));

		/*
		 * Ignore trees with no root page.
		 * Verify, then discard the checkpoint from the cache.
		 */
		if (root_addr_size != 0 &&
		    (ret = __wt_btree_tree_open(
		    session, root_addr, root_addr_size)) == 0) {
			if (WT_VRFY_DUMP(vs))
				WT_ERR(__wt_msg(session, "Root: %s %s",
				    __wt_addr_string(session,
				    root_addr, root_addr_size, vs->tmp1),
				    __wt_page_type_string(
				    btree->root.page->type)));

			WT_WITH_PAGE_INDEX(session,
			    ret = __verify_tree(session, &btree->root, vs));

			WT_TRET(__wt_cache_op(session, WT_SYNC_DISCARD));
		}

		/* Unload the checkpoint. */
		WT_TRET(bm->checkpoint_unload(bm, session));
		WT_ERR(ret);

		/* Display the tree shape. */
		if (vs->dump_shape)
			WT_ERR(__verify_tree_shape(session, vs));
	}
Example #7
0
/*
 * __wt_txn_checkpoint_log --
 *	Write a log record for a checkpoint operation.
 */
int
__wt_txn_checkpoint_log(
    WT_SESSION_IMPL *session, int full, uint32_t flags, WT_LSN *lsnp)
{
	WT_DECL_RET;
	WT_DECL_ITEM(logrec);
	WT_LSN *ckpt_lsn;
	WT_TXN *txn;
	const char *fmt = WT_UNCHECKED_STRING(IIQIU);
	uint8_t *end, *p;
	size_t recsize;
	uint32_t i, rectype = WT_LOGREC_CHECKPOINT;

	txn = &session->txn;
	ckpt_lsn = &txn->ckpt_lsn;

	/*
	 * If this is a file sync, log it unless there is a full checkpoint in
	 * progress.
	 */
	if (!full) {
		if (txn->full_ckpt) {
			if (lsnp != NULL)
				*lsnp = *ckpt_lsn;
			return (0);
		} else
			return (__txn_log_file_sync(session, flags, lsnp));
	}

	switch (flags) {
	case WT_TXN_LOG_CKPT_PREPARE:
		txn->full_ckpt = 1;
		*ckpt_lsn = S2C(session)->log->alloc_lsn;
		break;

	case WT_TXN_LOG_CKPT_START:
		/* Take a copy of the transaction snapshot. */
		txn->ckpt_nsnapshot = txn->snapshot_count;
		recsize = txn->ckpt_nsnapshot * WT_INTPACK64_MAXSIZE;
		WT_ERR(__wt_scr_alloc(session, recsize, &txn->ckpt_snapshot));
		p = txn->ckpt_snapshot->mem;
		end = p + recsize;
		for (i = 0; i < txn->snapshot_count; i++)
			WT_ERR(__wt_vpack_uint(
			    &p, WT_PTRDIFF(end, p), txn->snapshot[i]));
		break;

	case WT_TXN_LOG_CKPT_STOP:
		/*
		 * During a clean connection close, we get here without the
		 * prepare or start steps.  In that case, log the current LSN
		 * as the checkpoint LSN.
		 */
		if (!txn->full_ckpt) {
			txn->ckpt_nsnapshot = 0;
			*ckpt_lsn = S2C(session)->log->alloc_lsn;
		}

		/* Write the checkpoint log record. */
		WT_ERR(__wt_struct_size(session, &recsize, fmt,
		    rectype, ckpt_lsn->file, ckpt_lsn->offset,
		    txn->ckpt_nsnapshot, &txn->ckpt_snapshot));
		WT_ERR(__wt_logrec_alloc(session, recsize, &logrec));

		WT_ERR(__wt_struct_pack(session,
		    (uint8_t *)logrec->data + logrec->size, recsize, fmt,
		    rectype, ckpt_lsn->file, ckpt_lsn->offset,
		    txn->ckpt_nsnapshot, &txn->ckpt_snapshot));
		logrec->size += (uint32_t)recsize;
		WT_ERR(__wt_log_write(session, logrec, lsnp, 0));

		/*
		 * If this full checkpoint completed successfully and there is
		 * no hot backup in progress, tell the logging subsystem the
		 * checkpoint LSN so that it can archive.
		 */
		if (!S2C(session)->hot_backup)
			WT_ERR(__wt_log_ckpt(session, ckpt_lsn));

		/* FALLTHROUGH */
	case WT_TXN_LOG_CKPT_FAIL:
		/* Cleanup any allocated resources */
		INIT_LSN(ckpt_lsn);
		txn->ckpt_nsnapshot = 0;
		__wt_scr_free(&txn->ckpt_snapshot);
		txn->full_ckpt = 0;
		break;
	}

err:	__wt_logrec_free(session, &logrec);
	return (ret);
}
Example #8
0
/*
 * __wt_lsm_tree_create --
 *	Create an LSM tree structure for the given name.
 */
int
__wt_lsm_tree_create(WT_SESSION_IMPL *session,
    const char *uri, int exclusive, const char *config)
{
	WT_CONFIG_ITEM cval;
	WT_DECL_ITEM(buf);
	WT_DECL_RET;
	WT_LSM_TREE *lsm_tree;
	const char *cfg[] =
	    { WT_CONFIG_BASE(session, session_create), config, NULL };
	char *tmpconfig;

	/* If the tree is open, it already exists. */
	if ((ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree)) == 0) {
		__wt_lsm_tree_release(session, lsm_tree);
		return (exclusive ? EEXIST : 0);
	}
	WT_RET_NOTFOUND_OK(ret);

	/*
	 * If the tree has metadata, it already exists.
	 *
	 * !!!
	 * Use a local variable: we don't care what the existing configuration
	 * is, but we don't want to overwrite the real config.
	 */
	if (__wt_metadata_search(session, uri, &tmpconfig) == 0) {
		__wt_free(session, tmpconfig);
		return (exclusive ? EEXIST : 0);
	}
	WT_RET_NOTFOUND_OK(ret);

	WT_RET(__wt_config_gets(session, cfg, "key_format", &cval));
	if (WT_STRING_MATCH("r", cval.str, cval.len))
		WT_RET_MSG(session, EINVAL,
		    "LSM trees cannot be configured as column stores");

	WT_RET(__wt_calloc_def(session, 1, &lsm_tree));

	WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));

	WT_ERR(__wt_config_gets(session, cfg, "key_format", &cval));
	WT_ERR(__wt_strndup(
	    session, cval.str, cval.len, &lsm_tree->key_format));
	WT_ERR(__wt_config_gets(session, cfg, "value_format", &cval));
	WT_ERR(__wt_strndup(
	    session, cval.str, cval.len, &lsm_tree->value_format));

	WT_ERR(__wt_config_gets(session, cfg, "collator", &cval));
	WT_ERR(__wt_strndup(
	    session, cval.str, cval.len, &lsm_tree->collator_name));

	WT_ERR(__wt_config_gets(session, cfg, "lsm.auto_throttle", &cval));
	if (cval.val)
		F_SET(lsm_tree, WT_LSM_TREE_THROTTLE);
	else
		F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE);
	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom", &cval));
	FLD_SET(lsm_tree->bloom,
	    (cval.val == 0 ? WT_LSM_BLOOM_OFF : WT_LSM_BLOOM_MERGED));
	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_oldest", &cval));
	if (cval.val != 0)
		FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST);

	if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) &&
	    FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST))
		WT_ERR_MSG(session, EINVAL,
		    "Bloom filters can only be created on newest and oldest "
		    "chunks if bloom filters are enabled");

	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_config", &cval));
	if (cval.type == WT_CONFIG_ITEM_STRUCT) {
		cval.str++;
		cval.len -= 2;
	}
	WT_ERR(__wt_strndup(
	    session, cval.str, cval.len, &lsm_tree->bloom_config));

	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_bit_count", &cval));
	lsm_tree->bloom_bit_count = (uint32_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_hash_count", &cval));
	lsm_tree->bloom_hash_count = (uint32_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_max", &cval));
	lsm_tree->chunk_max = (uint64_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_size", &cval));
	lsm_tree->chunk_size = (uint64_t)cval.val;
	if (lsm_tree->chunk_size > lsm_tree->chunk_max)
		WT_ERR_MSG(session, EINVAL,
		    "Chunk size (chunk_size) must be smaller than or equal to "
		    "the maximum chunk size (chunk_max)");
	WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_max", &cval));
	lsm_tree->merge_max = (uint32_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_min", &cval));
	lsm_tree->merge_min = (uint32_t)cval.val;
	if (lsm_tree->merge_min > lsm_tree->merge_max)
		WT_ERR_MSG(session, EINVAL,
		    "LSM merge_min must be less than or equal to merge_max");

	/*
	 * Set up the config for each chunk.
	 *
	 * Make the memory_page_max double the chunk size, so application
	 * threads don't immediately try to force evict the chunk when the
	 * worker thread clears the NO_EVICTION flag.
	 */
	WT_ERR(__wt_scr_alloc(session, 0, &buf));
	WT_ERR(__wt_buf_fmt(session, buf,
	    "%s,key_format=u,value_format=u,memory_page_max=%" PRIu64,
	    config, 2 * lsm_tree->chunk_max));
	WT_ERR(__wt_strndup(
	    session, buf->data, buf->size, &lsm_tree->file_config));

	/* Create the first chunk and flush the metadata. */
	WT_ERR(__wt_lsm_meta_write(session, lsm_tree));

	/* Discard our partially populated handle. */
	ret = __lsm_tree_discard(session, lsm_tree);
	lsm_tree = NULL;

	/*
	 * Open our new tree and add it to the handle cache. Don't discard on
	 * error: the returned handle is NULL on error, and the metadata
	 * tracking macros handle cleaning up on failure.
	 */
	if (ret == 0)
		ret = __lsm_tree_open(session, uri, &lsm_tree);
	if (ret == 0)
		__wt_lsm_tree_release(session, lsm_tree);

	if (0) {
err:		WT_TRET(__lsm_tree_discard(session, lsm_tree));
	}
	__wt_scr_free(&buf);
	return (ret);
}
Example #9
0
/*
 * __wt_curindex_open --
 *	WT_SESSION->open_cursor method for index cursors.
 */
int
__wt_curindex_open(WT_SESSION_IMPL *session,
    const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
{
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,		/* get-key */
	    __curindex_get_value,		/* get-value */
	    __wt_cursor_set_key,		/* set-key */
	    __curindex_set_value,		/* set-value */
	    __curindex_compare,			/* compare */
	    __wt_cursor_equals,			/* equals */
	    __curindex_next,			/* next */
	    __curindex_prev,			/* prev */
	    __curindex_reset,			/* reset */
	    __curindex_search,			/* search */
	    __curindex_search_near,		/* search-near */
	    __wt_cursor_notsup,			/* insert */
	    __wt_cursor_notsup,			/* update */
	    __wt_cursor_notsup,			/* remove */
	    __wt_cursor_reconfigure_notsup,	/* reconfigure */
	    __curindex_close);			/* close */
	WT_CURSOR_INDEX *cindex;
	WT_CURSOR *cursor;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_INDEX *idx;
	WT_TABLE *table;
	const char *columns, *idxname, *tablename;
	size_t namesize;

	tablename = uri;
	if (!WT_PREFIX_SKIP(tablename, "index:") ||
	    (idxname = strchr(tablename, ':')) == NULL)
		WT_RET_MSG(session, EINVAL, "Invalid cursor URI: '%s'", uri);
	namesize = (size_t)(idxname - tablename);
	++idxname;

	if ((ret = __wt_schema_get_table(session,
	    tablename, namesize, false, &table)) != 0) {
		if (ret == WT_NOTFOUND)
			WT_RET_MSG(session, EINVAL,
			    "Cannot open cursor '%s' on unknown table", uri);
		return (ret);
	}

	columns = strchr(idxname, '(');
	if (columns == NULL)
		namesize = strlen(idxname);
	else
		namesize = (size_t)(columns - idxname);

	if ((ret = __wt_schema_open_index(
	    session, table, idxname, namesize, &idx)) != 0) {
		__wt_schema_release_table(session, table);
		return (ret);
	}
	WT_RET(__wt_calloc_one(session, &cindex));

	cursor = &cindex->iface;
	*cursor = iface;
	cursor->session = &session->iface;

	cindex->table = table;
	cindex->index = idx;
	cindex->key_plan = idx->key_plan;
	cindex->value_plan = idx->value_plan;

	cursor->internal_uri = idx->name;
	cursor->key_format = idx->idxkey_format;
	cursor->value_format = table->value_format;

	/*
	 * XXX
	 * A very odd corner case is an index with a recno key.
	 * The only way to get here is by creating an index on a column store
	 * using only the primary's recno as the index key.  Disallow that for
	 * now.
	 */
	if (WT_CURSOR_RECNO(cursor))
		WT_ERR_MSG(session, WT_ERROR,
		    "Column store indexes based on a record number primary "
		    "key are not supported");

	/* Handle projections. */
	if (columns != NULL) {
		WT_ERR(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(__wt_struct_reformat(session, table,
		    columns, strlen(columns), NULL, false, tmp));
		WT_ERR(__wt_strndup(
		    session, tmp->data, tmp->size, &cursor->value_format));

		WT_ERR(__wt_buf_init(session, tmp, 0));
		WT_ERR(__wt_struct_plan(session, table,
		    columns, strlen(columns), false, tmp));
		WT_ERR(__wt_strndup(
		    session, tmp->data, tmp->size, &cindex->value_plan));
	}

	WT_ERR(__wt_cursor_init(
	    cursor, cursor->internal_uri, owner, cfg, cursorp));

	WT_ERR(__wt_open_cursor(
	    session, idx->source, cursor, cfg, &cindex->child));

	/* Open the column groups needed for this index cursor. */
	WT_ERR(__curindex_open_colgroups(session, cindex, cfg));

	if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON))
		__wt_json_column_init(
		    cursor, table->key_format, &idx->colconf, &table->colconf);

	if (0) {
err:		WT_TRET(__curindex_close(cursor));
		*cursorp = NULL;
	}

	__wt_scr_free(session, &tmp);
	return (ret);
}
Example #10
0
/*
 * __wt_schema_open_indices --
 *	Open the indices for a table.
 */
int
__wt_schema_open_index(WT_SESSION_IMPL *session,
    WT_TABLE *table, const char *idxname, size_t len, WT_INDEX **indexp)
{
	WT_CURSOR *cursor;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_INDEX *idx;
	u_int i;
	int cmp, match;
	const char *idxconf, *name, *tablename, *uri;

	/* Check if we've already done the work. */
	if (idxname == NULL && table->idx_complete)
		return (0);

	cursor = NULL;
	idx = NULL;

	/* Build a search key. */
	tablename = table->name;
	(void)WT_PREFIX_SKIP(tablename, "table:");
	WT_ERR(__wt_scr_alloc(session, 512, &tmp));
	WT_ERR(__wt_buf_fmt(session, tmp, "index:%s:", tablename));

	/* Find matching indices. */
	WT_ERR(__wt_metadata_cursor(session, NULL, &cursor));
	cursor->set_key(cursor, tmp->data);
	if ((ret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0)
		ret = cursor->next(cursor);
	for (i = 0; ret == 0; i++, ret = cursor->next(cursor)) {
		WT_ERR(cursor->get_key(cursor, &uri));
		name = uri;
		if (!WT_PREFIX_SKIP(name, tmp->data))
			break;

		/* Is this the index we are looking for? */
		match = idxname == NULL || WT_STRING_MATCH(name, idxname, len);

		/*
		 * Ensure there is space, including if we have to make room for
		 * a new entry in the middle of the list.
		 */
		WT_ERR(__wt_realloc_def(session, &table->idx_alloc,
		    WT_MAX(i, table->nindices) + 1, &table->indices));

		/* Keep the in-memory list in sync with the metadata. */
		cmp = 0;
		while (table->indices[i] != NULL &&
		    (cmp = strcmp(uri, table->indices[i]->name)) > 0) {
			/* Index no longer exists, remove it. */
			__wt_free(session, table->indices[i]);
			memmove(&table->indices[i], &table->indices[i + 1],
			    (table->nindices - i) * sizeof(WT_INDEX *));
			table->indices[--table->nindices] = NULL;
		}
		if (cmp < 0) {
			/* Make room for a new index. */
			memmove(&table->indices[i + 1], &table->indices[i],
			    (table->nindices - i) * sizeof(WT_INDEX *));
			table->indices[i] = NULL;
			++table->nindices;
		}

		if (!match)
			continue;

		if (table->indices[i] == NULL) {
			WT_ERR(cursor->get_value(cursor, &idxconf));
			WT_ERR(__wt_calloc_def(session, 1, &idx));
			WT_ERR(__wt_strdup(session, uri, &idx->name));
			WT_ERR(__wt_strdup(session, idxconf, &idx->config));
			WT_ERR(__open_index(session, table, idx));

			table->indices[i] = idx;
			idx = NULL;
		}

		/* If we were looking for a single index, we're done. */
		if (indexp != NULL)
			*indexp = table->indices[i];
		if (idxname != NULL)
			break;
	}
	WT_ERR_NOTFOUND_OK(ret);

	/* If we did a full pass, we won't need to do it again. */
	if (idxname == NULL) {
		table->nindices = i;
		table->idx_complete = 1;
	}

err:	__wt_scr_free(&tmp);
	if (idx != NULL)
		__wt_schema_destroy_index(session, idx);
	if (cursor != NULL)
		WT_TRET(cursor->close(cursor));
	return (ret);
}
Example #11
0
/*
 * __wt_schema_open_colgroups --
 *	Open the column groups for a table.
 */
int
__wt_schema_open_colgroups(WT_SESSION_IMPL *session, WT_TABLE *table)
{
	WT_COLGROUP *colgroup;
	WT_CONFIG cparser;
	WT_CONFIG_ITEM ckey, cval;
	WT_DECL_RET;
	WT_DECL_ITEM(buf);
	const char *cgconfig;
	u_int i;

	if (table->cg_complete)
		return (0);

	WT_RET(__wt_scr_alloc(session, 0, &buf));
	colgroup = NULL;

	WT_RET(__wt_config_subinit(session, &cparser, &table->cgconf));

	/* Open each column group. */
	for (i = 0; i < WT_COLGROUPS(table); i++) {
		if (table->ncolgroups > 0)
			WT_ERR(__wt_config_next(&cparser, &ckey, &cval));
		else
			WT_CLEAR(ckey);

		/*
		 * Always open from scratch: we may have failed part of the way
		 * through opening a table, or column groups may have changed.
		 */
		if (table->cgroups[i] != NULL) {
			__wt_schema_destroy_colgroup(
			    session, table->cgroups[i]);
			table->cgroups[i] = NULL;
		}

		WT_ERR(__wt_schema_colgroup_name(session, table,
		    ckey.str, ckey.len, buf));
		if ((ret = __wt_metadata_search(
		    session, buf->data, &cgconfig)) != 0) {
			/* It is okay if the table is incomplete. */
			if (ret == WT_NOTFOUND)
				ret = 0;
			goto err;
		}

		WT_ERR(__wt_calloc_def(session, 1, &colgroup));
		colgroup->name = __wt_buf_steal(session, buf, NULL);
		colgroup->config = cgconfig;
		WT_ERR(__wt_config_getones(session,
		    colgroup->config, "columns", &colgroup->colconf));
		WT_ERR(__wt_config_getones(
		    session, colgroup->config, "source", &cval));
		WT_ERR(__wt_buf_fmt(session, buf, "%.*s",
		    (int)cval.len, cval.str));
		colgroup->source = __wt_buf_steal(session, buf, NULL);
		table->cgroups[i] = colgroup;
		colgroup = NULL;
	}

	if (!table->is_simple) {
		WT_ERR(__wt_table_check(session, table));

		WT_ERR(__wt_struct_plan(session,
		    table, table->colconf.str, table->colconf.len, 1, buf));
		table->plan = __wt_buf_steal(session, buf, NULL);
	}

	table->cg_complete = 1;

err:	__wt_scr_free(&buf);
	if (colgroup != NULL)
		__wt_schema_destroy_colgroup(session, colgroup);
	return (ret);
}
Example #12
0
/*
 * ___open_index --
 *	Open an index.
 */
static int
__open_index(WT_SESSION_IMPL *session, WT_TABLE *table, WT_INDEX *idx)
{
	WT_CONFIG colconf;
	WT_CONFIG_ITEM ckey, cval;
	WT_DECL_ITEM(buf);
	WT_DECL_ITEM(plan);
	WT_DECL_RET;
	u_int cursor_key_cols, i;

	WT_ERR(__wt_scr_alloc(session, 0, &buf));

	/* Get the data source from the index config. */
	WT_ERR(__wt_config_getones(session, idx->config, "source", &cval));
	WT_ERR(__wt_buf_fmt(
	    session, buf, "%.*s", (int)cval.len, cval.str));
	idx->source = __wt_buf_steal(session, buf, NULL);
	idx->need_value = WT_PREFIX_MATCH(idx->source, "lsm:");

	WT_ERR(__wt_config_getones(session, idx->config, "key_format", &cval));
	WT_ERR(__wt_buf_fmt(
	    session, buf, "%.*s", (int)cval.len, cval.str));
	idx->key_format = __wt_buf_steal(session, buf, NULL);

	/*
	 * The key format for an index is somewhat subtle: the application
	 * specifies a set of columns that it will use for the key, but the
	 * engine usually adds some hidden columns in order to derive the
	 * primary key.  These hidden columns are part of the file's key.
	 *
	 * The file's key_format is stored persistently, we need to calculate
	 * the index cursor key format (which will usually omit some of those
	 * keys).
	 */
	WT_ERR(__wt_config_getones(
	    session, idx->config, "columns", &idx->colconf));

	/* Start with the declared index columns. */
	WT_ERR(__wt_config_subinit(session, &colconf, &idx->colconf));
	cursor_key_cols = 0;
	while ((ret = __wt_config_next(&colconf, &ckey, &cval)) == 0) {
		WT_ERR(__wt_buf_catfmt(
		    session, buf, "%.*s,", (int)ckey.len, ckey.str));
		++cursor_key_cols;
	}
	if (ret != 0 && ret != WT_NOTFOUND)
		goto err;

	/*
	 * Now add any primary key columns from the table that are not
	 * already part of the index key.
	 */
	WT_ERR(__wt_config_subinit(session, &colconf, &table->colconf));
	for (i = 0; i < table->nkey_columns &&
	    (ret = __wt_config_next(&colconf, &ckey, &cval)) == 0;
	    i++) {
		/*
		 * If the primary key column is already in the secondary key,
		 * don't add it again.
		 */
		if (__wt_config_subgetraw(
		    session, &idx->colconf, &ckey, &cval) == 0)
			continue;
		WT_ERR(__wt_buf_catfmt(
		    session, buf, "%.*s,", (int)ckey.len, ckey.str));
	}
	if (ret != 0 && ret != WT_NOTFOUND)
		goto err;

	WT_ERR(__wt_scr_alloc(session, 0, &plan));
	WT_ERR(__wt_struct_plan(session, table, buf->data, buf->size, 0, plan));
	idx->key_plan = __wt_buf_steal(session, plan, NULL);

	/* Set up the cursor key format (the visible columns). */
	WT_ERR(__wt_buf_init(session, buf, 0));
	WT_ERR(__wt_struct_truncate(session,
	    idx->key_format, cursor_key_cols, buf));
	idx->idxkey_format = __wt_buf_steal(session, buf, NULL);

	/* By default, index cursor values are the table value columns. */
	/* TODO Optimize to use index columns in preference to table lookups. */
	WT_ERR(__wt_struct_plan(session,
	    table, table->colconf.str, table->colconf.len, 1, plan));
	idx->value_plan = __wt_buf_steal(session, plan, NULL);

err:	__wt_scr_free(&buf);
	__wt_scr_free(&plan);
	return (ret);
}
Example #13
0
/*
 * __verify_dsk_row --
 *	Walk a WT_PAGE_ROW_INT or WT_PAGE_ROW_LEAF disk page and verify it.
 */
static int
__verify_dsk_row(
    WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_DECL_ITEM(current);
	WT_DECL_ITEM(last_ovfl);
	WT_DECL_ITEM(last_pfx);
	WT_DECL_RET;
	WT_ITEM *last;
	enum { FIRST, WAS_KEY, WAS_VALUE } last_cell_type;
	void *huffman;
	uint32_t cell_num, cell_type, i, key_cnt, prefix;
	uint8_t *end;
	int cmp;

	btree = S2BT(session);
	bm = btree->bm;
	unpack = &_unpack;
	huffman = dsk->type == WT_PAGE_ROW_INT ? NULL : btree->huffman_key;

	WT_ERR(__wt_scr_alloc(session, 0, &current));
	WT_ERR(__wt_scr_alloc(session, 0, &last_pfx));
	WT_ERR(__wt_scr_alloc(session, 0, &last_ovfl));
	last = last_ovfl;

	end = (uint8_t *)dsk + dsk->mem_size;

	last_cell_type = FIRST;
	cell_num = 0;
	key_cnt = 0;
	WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
		++cell_num;

		/* Carefully unpack the cell. */
		if (__wt_cell_unpack_safe(cell, unpack, end) != 0) {
			ret = __err_cell_corrupted(session, cell_num, addr);
			goto err;
		}

		/* Check the raw and collapsed cell types. */
		WT_ERR(__err_cell_type(
		    session, cell_num, addr, unpack->raw, dsk->type));
		WT_ERR(__err_cell_type(
		    session, cell_num, addr, unpack->type, dsk->type));
		cell_type = unpack->type;

		/*
		 * Check ordering relationships between the WT_CELL entries.
		 * For row-store internal pages, check for:
		 *	two values in a row,
		 *	two keys in a row,
		 *	a value as the first cell on a page.
		 * For row-store leaf pages, check for:
		 *	two values in a row,
		 *	a value as the first cell on a page.
		 */
		switch (cell_type) {
		case WT_CELL_KEY:
		case WT_CELL_KEY_OVFL:
			++key_cnt;
			switch (last_cell_type) {
			case FIRST:
			case WAS_VALUE:
				break;
			case WAS_KEY:
				if (dsk->type == WT_PAGE_ROW_LEAF)
					break;
				WT_ERR_VRFY(session,
				    "cell %" PRIu32 " on page at %s is the "
				    "first of two adjacent keys",
				    cell_num - 1, addr);
			}
			last_cell_type = WAS_KEY;
			break;
		case WT_CELL_ADDR_DEL:
		case WT_CELL_ADDR_INT:
		case WT_CELL_ADDR_LEAF:
		case WT_CELL_ADDR_LEAF_NO:
		case WT_CELL_VALUE:
		case WT_CELL_VALUE_OVFL:
			switch (last_cell_type) {
			case FIRST:
				WT_ERR_VRFY(session,
				    "page at %s begins with a value", addr);
			case WAS_KEY:
				break;
			case WAS_VALUE:
				WT_ERR_VRFY(session,
				    "cell %" PRIu32 " on page at %s is the "
				    "first of two adjacent values",
				    cell_num - 1, addr);
			}
			last_cell_type = WAS_VALUE;
			break;
		}

		/* Check if any referenced item has a valid address. */
		switch (cell_type) {
		case WT_CELL_ADDR_DEL:
		case WT_CELL_ADDR_INT:
		case WT_CELL_ADDR_LEAF:
		case WT_CELL_ADDR_LEAF_NO:
		case WT_CELL_KEY_OVFL:
		case WT_CELL_VALUE_OVFL:
			if (!bm->addr_valid(bm,
			    session, unpack->data, unpack->size))
				goto eof;
			break;
		}

		/*
		 * Remaining checks are for key order and prefix compression.
		 * If this cell isn't a key, we're done, move to the next cell.
		 * If this cell is an overflow item, instantiate the key and
		 * compare it with the last key. Otherwise, we have to deal with
		 * prefix compression.
		 */
		switch (cell_type) {
		case WT_CELL_KEY:
			break;
		case WT_CELL_KEY_OVFL:
			WT_ERR(__wt_dsk_cell_data_ref(
			    session, dsk->type, unpack, current));
			goto key_compare;
		default:
			/* Not a key -- continue with the next cell. */
			continue;
		}

		/*
		 * Prefix compression checks.
		 *
		 * Confirm the first non-overflow key on a page has a zero
		 * prefix compression count.
		 */
		prefix = unpack->prefix;
		if (last_pfx->size == 0 && prefix != 0)
			WT_ERR_VRFY(session,
			    "the %" PRIu32 " key on page at %s is the first "
			    "non-overflow key on the page and has a non-zero "
			    "prefix compression value",
			    cell_num, addr);

		/* Confirm the prefix compression count is possible. */
		if (cell_num > 1 && prefix > last->size)
			WT_ERR_VRFY(session,
			    "key %" PRIu32 " on page at %s has a prefix "
			    "compression count of %" PRIu32 ", larger than "
			    "the length of the previous key, %" WT_SIZET_FMT,
			    cell_num, addr, prefix, last->size);

		/*
		 * If Huffman decoding required, unpack the cell to build the
		 * key, then resolve the prefix.  Else, we can do it faster
		 * internally because we don't have to shuffle memory around as
		 * much.
		 */
		if (huffman != NULL) {
			WT_ERR(__wt_dsk_cell_data_ref(
			    session, dsk->type, unpack, current));

			/*
			 * If there's a prefix, make sure there's enough buffer
			 * space, then shift the decoded data past the prefix
			 * and copy the prefix into place.  Take care with the
			 * pointers: current->data may be pointing inside the
			 * buffer.
			 */
			if (prefix != 0) {
				WT_ERR(__wt_buf_grow(
				    session, current, prefix + current->size));
				memmove((uint8_t *)current->mem + prefix,
				    current->data, current->size);
				memcpy(current->mem, last->data, prefix);
				current->data = current->mem;
				current->size += prefix;
			}
		} else {
			/*
			 * Get the cell's data/length and make sure we have
			 * enough buffer space.
			 */
			WT_ERR(__wt_buf_init(
			    session, current, prefix + unpack->size));

			/* Copy the prefix then the data into place. */
			if (prefix != 0)
				memcpy(current->mem, last->data, prefix);
			memcpy((uint8_t *)current->mem + prefix, unpack->data,
			    unpack->size);
			current->size = prefix + unpack->size;
		}

key_compare:	/*
		 * Compare the current key against the last key.
		 *
		 * Be careful about the 0th key on internal pages: we only store
		 * the first byte and custom collators may not be able to handle
		 * truncated keys.
		 */
		if ((dsk->type == WT_PAGE_ROW_INT && cell_num > 3) ||
		    (dsk->type != WT_PAGE_ROW_INT && cell_num > 1)) {
			WT_ERR(__wt_compare(
			    session, btree->collator, last, current, &cmp));
			if (cmp >= 0)
				WT_ERR_VRFY(session,
				    "the %" PRIu32 " and %" PRIu32 " keys on "
				    "page at %s are incorrectly sorted",
				    cell_num - 2, cell_num, addr);
		}

		/*
		 * Swap the buffers: last always references the last key entry,
		 * last_pfx and last_ovfl reference the last prefix-compressed
		 * and last overflow key entries.  Current gets pointed to the
		 * buffer we're not using this time around, which is where the
		 * next key goes.
		 */
		last = current;
		if (cell_type == WT_CELL_KEY) {
			current = last_pfx;
			last_pfx = last;
		} else {
			current = last_ovfl;
			last_ovfl = last;
		}
		WT_ASSERT(session, last != current);
	}
Example #14
0
/*
 * __wt_txn_checkpoint_log --
 *	Write a log record for a checkpoint operation.
 */
int
__wt_txn_checkpoint_log(
    WT_SESSION_IMPL *session, bool full, uint32_t flags, WT_LSN *lsnp)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_ITEM(logrec);
	WT_DECL_RET;
	WT_ITEM *ckpt_snapshot, empty;
	WT_LSN *ckpt_lsn;
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	uint8_t *end, *p;
	size_t recsize;
	uint32_t i, rectype;
	const char *fmt;

	conn = S2C(session);
	txn_global = &conn->txn_global;
	txn = &session->txn;
	ckpt_lsn = &txn->ckpt_lsn;

	/*
	 * If this is a file sync, log it unless there is a full checkpoint in
	 * progress.
	 */
	if (!full) {
		if (txn->full_ckpt) {
			if (lsnp != NULL)
				*lsnp = *ckpt_lsn;
			return (0);
		}
		return (__txn_log_file_sync(session, flags, lsnp));
	}

	switch (flags) {
	case WT_TXN_LOG_CKPT_PREPARE:
		txn->full_ckpt = true;

		if (conn->compat_major >= WT_LOG_V2) {
			/*
			 * Write the system log record containing a checkpoint
			 * start operation.
			 */
			rectype = WT_LOGREC_SYSTEM;
			fmt = WT_UNCHECKED_STRING(I);
			WT_ERR(__wt_struct_size(
			    session, &recsize, fmt, rectype));
			WT_ERR(__wt_logrec_alloc(session, recsize, &logrec));

			WT_ERR(__wt_struct_pack(session,
			    (uint8_t *)logrec->data + logrec->size, recsize,
			    fmt, rectype));
			logrec->size += (uint32_t)recsize;
			WT_ERR(__wt_logop_checkpoint_start_pack(
			    session, logrec));
			WT_ERR(__wt_log_write(session, logrec, ckpt_lsn, 0));
		} else {
			WT_ERR(__wt_log_printf(session,
			    "CHECKPOINT: Starting record"));
			WT_ERR(__wt_log_flush_lsn(session, ckpt_lsn, true));
		}

		/*
		 * We take and immediately release the visibility lock.
		 * Acquiring the write lock guarantees that any transaction
		 * that has written to the log has also made its transaction
		 * visible at this time.
		 */
		__wt_writelock(session, &txn_global->visibility_rwlock);
		__wt_writeunlock(session, &txn_global->visibility_rwlock);

		/*
		 * We need to make sure that the log records in the checkpoint
		 * LSN are on disk.  In particular to make sure that the
		 * current log file exists.
		 */
		WT_ERR(__wt_log_force_sync(session, ckpt_lsn));
		break;
	case WT_TXN_LOG_CKPT_START:
		/* Take a copy of the transaction snapshot. */
		txn->ckpt_nsnapshot = txn->snapshot_count;
		recsize = (size_t)txn->ckpt_nsnapshot * WT_INTPACK64_MAXSIZE;
		WT_ERR(__wt_scr_alloc(session, recsize, &txn->ckpt_snapshot));
		p = txn->ckpt_snapshot->mem;
		end = p + recsize;
		for (i = 0; i < txn->snapshot_count; i++)
			WT_ERR(__wt_vpack_uint(
			    &p, WT_PTRDIFF(end, p), txn->snapshot[i]));
		break;
	case WT_TXN_LOG_CKPT_STOP:
		/*
		 * During a clean connection close, we get here without the
		 * prepare or start steps.  In that case, log the current LSN
		 * as the checkpoint LSN.
		 */
		if (!txn->full_ckpt) {
			txn->ckpt_nsnapshot = 0;
			WT_CLEAR(empty);
			ckpt_snapshot = &empty;
			WT_ERR(__wt_log_flush_lsn(session, ckpt_lsn, true));
		} else
			ckpt_snapshot = txn->ckpt_snapshot;

		/* Write the checkpoint log record. */
		rectype = WT_LOGREC_CHECKPOINT;
		fmt = WT_UNCHECKED_STRING(IIIIu);
		WT_ERR(__wt_struct_size(session, &recsize,
		    fmt, rectype, ckpt_lsn->l.file, ckpt_lsn->l.offset,
		    txn->ckpt_nsnapshot, ckpt_snapshot));
		WT_ERR(__wt_logrec_alloc(session, recsize, &logrec));

		WT_ERR(__wt_struct_pack(session,
		    (uint8_t *)logrec->data + logrec->size, recsize,
		    fmt, rectype, ckpt_lsn->l.file, ckpt_lsn->l.offset,
		    txn->ckpt_nsnapshot, ckpt_snapshot));
		logrec->size += (uint32_t)recsize;
		WT_ERR(__wt_log_write(session, logrec, lsnp,
		    F_ISSET(conn, WT_CONN_CKPT_SYNC) ?
		    WT_LOG_FSYNC : 0));

		/*
		 * If this full checkpoint completed successfully and there is
		 * no hot backup in progress and this is not an unclean
		 * recovery, tell the logging subsystem the checkpoint LSN so
		 * that it can archive.  Do not update the logging checkpoint
		 * LSN if this is during a clean connection close, only during
		 * a full checkpoint.  A clean close may not update any
		 * metadata LSN and we do not want to archive in that case.
		 */
		if (!conn->hot_backup &&
		    (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY) ||
		    FLD_ISSET(conn->log_flags, WT_CONN_LOG_FORCE_DOWNGRADE)) &&
		    txn->full_ckpt)
			__wt_log_ckpt(session, ckpt_lsn);

		/* FALLTHROUGH */
	case WT_TXN_LOG_CKPT_CLEANUP:
		/* Cleanup any allocated resources */
		WT_INIT_LSN(ckpt_lsn);
		txn->ckpt_nsnapshot = 0;
		__wt_scr_free(session, &txn->ckpt_snapshot);
		txn->full_ckpt = false;
		break;
	WT_ILLEGAL_VALUE_ERR(session);
	}

err:	__wt_logrec_free(session, &logrec);
	return (ret);
}
Example #15
0
/*
 * __wt_log_newfile --
 *	Create the next log file and write the file header record into it.
 */
int
__wt_log_newfile(WT_SESSION_IMPL *session, int conn_create)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_ITEM(buf);
	WT_DECL_RET;
	WT_LOG *log;
	WT_LOG_DESC *desc;
	WT_LOG_RECORD *logrec;
	WT_LOGSLOT tmp;
	WT_MYSLOT myslot;

	conn = S2C(session);
	log = conn->log;

	/*
	 * Set aside the log file handle to be closed later.  Other threads
	 * may still be using it to write to the log.
	 */
	WT_ASSERT(session, log->log_close_fh == NULL);
	log->log_close_fh = log->log_fh;
	log->fileid++;
	WT_RET(__log_openfile(session, 1, &log->log_fh, log->fileid));
	log->alloc_lsn.file = log->fileid;
	log->alloc_lsn.offset = log->log_fh->size;

	/*
	 * Set up the log descriptor record.  Use a scratch buffer to
	 * get correct alignment for direct I/O.
	 */
	WT_ASSERT(session, sizeof(WT_LOG_DESC) < log->allocsize);
	WT_RET(__wt_scr_alloc(session, log->allocsize, &buf));
	memset(buf->mem, 0, log->allocsize);
	logrec = (WT_LOG_RECORD *)buf->mem;
	desc = (WT_LOG_DESC *)logrec->record;
	desc->log_magic = WT_LOG_MAGIC;
	desc->majorv = WT_LOG_MAJOR_VERSION;
	desc->minorv = WT_LOG_MINOR_VERSION;
	desc->log_size = (uint64_t)conn->log_file_max;

	/*
	 * Now that the record is set up, initialize the record header.
	 */
	logrec->len = log->allocsize;
	logrec->checksum = 0;
	logrec->checksum = __wt_cksum(logrec, log->allocsize);
	WT_CLEAR(tmp);
	myslot.slot = &tmp;
	myslot.offset = 0;

	/*
	 * Recursively call __log_acquire to allocate log space for the
	 * log descriptor record.  Call __log_fill to write it, but we
	 * do not need to call __log_release because we're not waiting for
	 * earlier operations to complete.
	 */
	WT_ERR(__log_acquire(session, logrec->len, &tmp));
	WT_ERR(__log_fill(session, &myslot, 1, buf, NULL));

	/*
	 * If we're called from connection creation code, we need to update
	 * the LSNs since we're the only write in progress.
	 */
	if (conn_create) {
		WT_ERR(__wt_fsync(session, log->log_fh));
		log->sync_lsn = tmp.slot_end_lsn;
		log->write_lsn = tmp.slot_end_lsn;
	}

err:	__wt_scr_free(&buf);
	return (ret);
}
Example #16
0
/*
 * __wt_las_sweep --
 *	Sweep the lookaside table.
 */
int
__wt_las_sweep(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR *cursor;
	WT_DECL_ITEM(las_addr);
	WT_DECL_ITEM(las_key);
	WT_DECL_RET;
	WT_ITEM *key;
	uint64_t cnt, las_counter, las_txnid;
	uint32_t las_id, session_flags;
	int notused;

	conn = S2C(session);
	cursor = NULL;
	key = &conn->las_sweep_key;
	session_flags = 0;		/* [-Werror=maybe-uninitialized] */

	WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
	WT_ERR(__wt_scr_alloc(session, 0, &las_key));

	WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));

	/*
	 * If we're not starting a new sweep, position the cursor using the key
	 * from the last call (we don't care if we're before or after the key,
	 * just roughly in the same spot is fine).
	 */
	if (conn->las_sweep_call != 0 && key->data != NULL) {
		__wt_cursor_set_raw_key(cursor, key);
		if ((ret = cursor->search_near(cursor, &notused)) != 0)
			goto srch_notfound;
	}

	/*
	 * The sweep server wakes up every 10 seconds (by default), it's a slow
	 * moving thread. Try to review the entire lookaside table once every 5
	 * minutes, or every 30 calls.
	 *
	 * The reason is because the lookaside table exists because we're seeing
	 * cache/eviction pressure (it allows us to trade performance and disk
	 * space for cache space), and it's likely lookaside blocks are being
	 * evicted, and reading them back in doesn't help things. A trickier,
	 * but possibly better, alternative might be to review all lookaside
	 * blocks in the cache in order to get rid of them, and slowly review
	 * lookaside blocks that have already been evicted.
	 *
	 * We can't know for sure how many records are in the lookaside table,
	 * the cursor insert and remove statistics aren't updated atomically.
	 * Start with reviewing 100 rows, and if it takes more than the target
	 * number of calls to finish, increase the number of rows checked on
	 * each call; if it takes less than the target calls to finish, then
	 * decrease the number of rows reviewed on each call (but never less
	 * than 100).
	 */
#define	WT_SWEEP_LOOKASIDE_MIN_CNT	100
#define	WT_SWEEP_LOOKASIDE_PASS_TARGET	 30
	++conn->las_sweep_call;
	if ((cnt = conn->las_sweep_cnt) < WT_SWEEP_LOOKASIDE_MIN_CNT)
		cnt = conn->las_sweep_cnt = WT_SWEEP_LOOKASIDE_MIN_CNT;

	/* Walk the file. */
	for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) {
		/*
		 * If the loop terminates after completing a work unit, we will
		 * continue the table sweep next time. Get a local copy of the
		 * sweep key, we're going to reset the cursor; do so before
		 * calling cursor.remove, cursor.remove can discard our hazard
		 * pointer and the page could be evicted from underneath us.
		 */
		if (cnt == 1) {
			WT_ERR(__wt_cursor_get_raw_key(cursor, key));
			if (!WT_DATA_IN_ITEM(key))
				WT_ERR(__wt_buf_set(
				    session, key, key->data, key->size));
		}

		WT_ERR(cursor->get_key(cursor,
		    &las_id, las_addr, &las_counter, &las_txnid, las_key));

		/*
		 * If the on-page record transaction ID associated with the
		 * record is globally visible, the record can be discarded.
		 *
		 * Cursor opened overwrite=true: won't return WT_NOTFOUND should
		 * another thread remove the record before we do, and the cursor
		 * remains positioned in that case.
		 */
		if (__wt_txn_visible_all(session, las_txnid))
			WT_ERR(cursor->remove(cursor));
	}

	/*
	 * When reaching the lookaside table end or the target number of calls,
	 * adjust the row count. Decrease/increase the row count depending on
	 * if the number of calls is less/more than the target.
	 */
	if (ret == WT_NOTFOUND ||
	    conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET) {
		if (conn->las_sweep_call < WT_SWEEP_LOOKASIDE_PASS_TARGET &&
		    conn->las_sweep_cnt > WT_SWEEP_LOOKASIDE_MIN_CNT)
			conn->las_sweep_cnt -= WT_SWEEP_LOOKASIDE_MIN_CNT;
		if (conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET)
			conn->las_sweep_cnt += WT_SWEEP_LOOKASIDE_MIN_CNT;
	}

srch_notfound:
	if (ret == WT_NOTFOUND)
		conn->las_sweep_call = 0;

	WT_ERR_NOTFOUND_OK(ret);

	if (0) {
err:		__wt_buf_free(session, key);
	}

	WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));

	__wt_scr_free(session, &las_addr);
	__wt_scr_free(session, &las_key);

	return (ret);
}
Example #17
0
/*
 * __open_verbose --
 *	Optionally output a verbose message on handle open.
 */
static inline int
__open_verbose(
    WT_SESSION_IMPL *session, const char *name, int file_type, u_int flags)
{
#ifdef HAVE_VERBOSE
	WT_DECL_RET;
	WT_DECL_ITEM(tmp);
	const char *file_type_tag, *sep;

	if (!WT_VERBOSE_ISSET(session, WT_VERB_FILEOPS))
		return (0);

	/*
	 * It's useful to track file opens when debugging platforms, take some
	 * effort to output good tracking information.
	 */

	switch (file_type) {
	case WT_FS_OPEN_FILE_TYPE_CHECKPOINT:
		file_type_tag = "checkpoint";
		break;
	case WT_FS_OPEN_FILE_TYPE_DATA:
		file_type_tag = "data";
		break;
	case WT_FS_OPEN_FILE_TYPE_DIRECTORY:
		file_type_tag = "directory";
		break;
	case WT_FS_OPEN_FILE_TYPE_LOG:
		file_type_tag = "log";
		break;
	case WT_FS_OPEN_FILE_TYPE_REGULAR:
		file_type_tag = "regular";
		break;
	default:
		file_type_tag = "unknown open type";
		break;
	}

	WT_RET(__wt_scr_alloc(session, 0, &tmp));
	sep = " (";
#define	WT_FS_OPEN_VERBOSE_FLAG(f, name)				\
	if (LF_ISSET(f)) {						\
		WT_ERR(__wt_buf_catfmt(					\
		    session, tmp, "%s%s", sep, name));			\
		sep = ", ";						\
	}

	WT_FS_OPEN_VERBOSE_FLAG(WT_FS_OPEN_CREATE, "create");
	WT_FS_OPEN_VERBOSE_FLAG(WT_FS_OPEN_DIRECTIO, "direct-IO");
	WT_FS_OPEN_VERBOSE_FLAG(WT_FS_OPEN_EXCLUSIVE, "exclusive");
	WT_FS_OPEN_VERBOSE_FLAG(WT_FS_OPEN_FIXED, "fixed");
	WT_FS_OPEN_VERBOSE_FLAG(WT_FS_OPEN_READONLY, "readonly");

	if (tmp->size != 0)
		WT_ERR(__wt_buf_catfmt(session, tmp, ")"));

	__wt_verbose(session, WT_VERB_FILEOPS,
	    "%s: file-open: type %s%s",
	    name, file_type_tag, tmp->size == 0 ? "" : (char *)tmp->data);

err:	__wt_scr_free(session, &tmp);
	return (ret);
#else
	WT_UNUSED(session);
	WT_UNUSED(name);
	WT_UNUSED(file_type);
	WT_UNUSED(flags);
	return (0);
#endif
}
Example #18
0
/*
 * __wt_lsm_meta_write --
 *	Write the metadata for an LSM tree.
 */
int
__wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_DECL_ITEM(buf);
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	u_int i;
	int first;

	WT_RET(__wt_scr_alloc(session, 0, &buf));
	WT_ERR(__wt_buf_fmt(session, buf,
	    "key_format=%s,value_format=%s,bloom_config=(%s),file_config=(%s)",
	    lsm_tree->key_format, lsm_tree->value_format,
	    lsm_tree->bloom_config, lsm_tree->file_config));
	if (lsm_tree->collator_name != NULL)
		WT_ERR(__wt_buf_catfmt(
		    session, buf, ",collator=%s", lsm_tree->collator_name));
	WT_ERR(__wt_buf_catfmt(session, buf,
	    ",last=%" PRIu32
	    ",chunk_count_limit=%" PRIu32
	    ",chunk_max=%" PRIu64
	    ",chunk_size=%" PRIu64
	    ",auto_throttle=%" PRIu32
	    ",merge_max=%" PRIu32
	    ",merge_min=%" PRIu32
	    ",bloom=%" PRIu32
	    ",bloom_bit_count=%" PRIu32
	    ",bloom_hash_count=%" PRIu32,
	    lsm_tree->last, lsm_tree->chunk_count_limit,
	    lsm_tree->chunk_max, lsm_tree->chunk_size,
	    F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) ? 1 : 0,
	    lsm_tree->merge_max, lsm_tree->merge_min, lsm_tree->bloom,
	    lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count));
	WT_ERR(__wt_buf_catfmt(session, buf, ",chunks=["));
	for (i = 0; i < lsm_tree->nchunks; i++) {
		chunk = lsm_tree->chunk[i];
		if (i > 0)
			WT_ERR(__wt_buf_catfmt(session, buf, ","));
		WT_ERR(__wt_buf_catfmt(session, buf, "id=%" PRIu32, chunk->id));
		if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
			WT_ERR(__wt_buf_catfmt(session, buf, ",bloom"));
		if (chunk->size != 0)
			WT_ERR(__wt_buf_catfmt(session, buf,
			    ",chunk_size=%" PRIu64, chunk->size));
		if (chunk->count != 0)
			WT_ERR(__wt_buf_catfmt(
			    session, buf, ",count=%" PRIu64, chunk->count));
		WT_ERR(__wt_buf_catfmt(
		    session, buf, ",generation=%" PRIu32, chunk->generation));
	}
	WT_ERR(__wt_buf_catfmt(session, buf, "]"));
	WT_ERR(__wt_buf_catfmt(session, buf, ",old_chunks=["));
	first = 1;
	for (i = 0; i < lsm_tree->nold_chunks; i++) {
		chunk = lsm_tree->old_chunks[i];
		WT_ASSERT(session, chunk != NULL);
		if (first)
			first = 0;
		else
			WT_ERR(__wt_buf_catfmt(session, buf, ","));
		WT_ERR(__wt_buf_catfmt(session, buf, "\"%s\"", chunk->uri));
		if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
			WT_ERR(__wt_buf_catfmt(
			    session, buf, ",bloom=\"%s\"", chunk->bloom_uri));
	}
	WT_ERR(__wt_buf_catfmt(session, buf, "]"));
	ret = __wt_metadata_update(session, lsm_tree->name, buf->data);
	WT_ERR(ret);

err:	__wt_scr_free(session, &buf);
	return (ret);
}
Example #19
0
/*
 * __wt_huffman_read --
 *	Read a Huffman table from a file.
 */
static int
__wt_huffman_read(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *ip,
    struct __wt_huffman_table **tablep, u_int *entriesp, u_int *numbytesp)
{
	struct __wt_huffman_table *table, *tp;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_FSTREAM *fs;
	int64_t symbol, frequency;
	u_int entries, lineno;
	int n;
	bool is_utf8;

	*tablep = NULL;
	*entriesp = *numbytesp = 0;

	fs = NULL;
	table = NULL;

	/*
	 * Try and open the backing file.
	 */
	WT_RET(__huffman_confchk_file(session, ip, &is_utf8, &fs));

	/*
	 * UTF-8 table is 256 bytes, with a range of 0-255.
	 * UTF-16 is 128KB (2 * 65536) bytes, with a range of 0-65535.
	 */
	if (is_utf8) {
		entries = UINT8_MAX;
		*numbytesp = 1;
		WT_ERR(__wt_calloc_def(session, entries, &table));
	} else {
		entries = UINT16_MAX;
		*numbytesp = 2;
		WT_ERR(__wt_calloc_def(session, entries, &table));
	}

	WT_ERR(__wt_scr_alloc(session, 0, &tmp));
	for (tp = table, lineno = 1;; ++tp, ++lineno) {
		WT_ERR(__wt_getline(session, fs, tmp));
		if (tmp->size == 0)
			break;
		n = sscanf(
		    tmp->data, "%" SCNi64 " %" SCNi64, &symbol, &frequency);
		/*
		 * Entries is 0-based, that is, there are (entries +1) possible
		 * values that can be configured. The line number is 1-based, so
		 * adjust the test for too many entries, and report (entries +1)
		 * in the error as the maximum possible number of entries.
		 */
		if (lineno > entries + 1)
			WT_ERR_MSG(session, EINVAL,
			    "Huffman table file %.*s is corrupted, "
			    "more than %" PRIu32 " entries",
			    (int)ip->len, ip->str, entries + 1);
		if (n != 2)
			WT_ERR_MSG(session, EINVAL,
			    "line %u of Huffman table file %.*s is corrupted: "
			    "expected two unsigned integral values",
			    lineno, (int)ip->len, ip->str);
		if (symbol < 0 || symbol > entries)
			WT_ERR_MSG(session, EINVAL,
			    "line %u of Huffman file %.*s is corrupted; "
			    "symbol %" PRId64 " not in range, maximum "
			    "value is %u",
			    lineno, (int)ip->len, ip->str, symbol, entries);
		if (frequency < 0 || frequency > UINT32_MAX)
			WT_ERR_MSG(session, EINVAL,
			    "line %u of Huffman file %.*s is corrupted; "
			    "frequency %" PRId64 " not in range, maximum "
			    "value is %" PRIu32,
			    lineno, (int)ip->len, ip->str, frequency,
			    (uint32_t)UINT32_MAX);

		tp->symbol = (uint32_t)symbol;
		tp->frequency = (uint32_t)frequency;
	}

	*entriesp = lineno - 1;
	*tablep = table;

	if (0) {
err:		__wt_free(session, table);
	}
	(void)__wt_fclose(session, &fs);

	__wt_scr_free(session, &tmp);
	return (ret);
}
Example #20
0
/*
 * __wt_conn_optrack_setup --
 *     Set up operation logging.
 */
int
__wt_conn_optrack_setup(WT_SESSION_IMPL *session,
    const char *cfg[], bool reconfig)
{
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_ITEM(buf);
	WT_DECL_RET;

	conn = S2C(session);

	/* Once an operation tracking path has been set it can't be changed. */
	if (!reconfig) {
		WT_RET(__wt_config_gets(session,
		    cfg, "operation_tracking.path", &cval));
		WT_RET(__wt_strndup(session,
		    cval.str, cval.len, &conn->optrack_path));
	}

	WT_RET(__wt_config_gets(session,
	    cfg, "operation_tracking.enabled", &cval));
	if (cval.val == 0) {
		if (F_ISSET(conn, WT_CONN_OPTRACK)) {
			WT_RET(__wt_conn_optrack_teardown(session, reconfig));
			F_CLR(conn, WT_CONN_OPTRACK);
		}
		return (0);
	}
	if (F_ISSET(conn, WT_CONN_READONLY))
		/* Operation tracking isn't supported in read-only mode */
		WT_RET_MSG(session, EINVAL,
		    "Operation tracking is incompatible with read only "
		    "configuration.");
	if (F_ISSET(conn, WT_CONN_OPTRACK))
		/* Already enabled, nothing else to do */
		return (0);

	/*
	 * Operation tracking files will include the ID of the creating process
	 * in their name, so we can distinguish between log files created by
	 * different WiredTiger processes in the same directory. We cache the
	 * process id for future use.
	 */
	conn->optrack_pid = __wt_process_id();

	/*
	 * Open the file in the same directory that will hold a map of
	 * translations between function names and function IDs. If the file
	 * exists, remove it.
	 */
	WT_RET(__wt_scr_alloc(session, 0, &buf));
	WT_ERR(__wt_filename_construct(session, conn->optrack_path,
	    "optrack-map", conn->optrack_pid, UINT32_MAX, buf));
	WT_ERR(__wt_open(session,
	    (const char *)buf->data, WT_FS_OPEN_FILE_TYPE_REGULAR,
	    WT_FS_OPEN_CREATE, &conn->optrack_map_fh));

	WT_ERR(__wt_spin_init(session,
	    &conn->optrack_map_spinlock, "optrack map spinlock"));

	WT_ERR(__wt_malloc(session, WT_OPTRACK_BUFSIZE,
	    &conn->dummy_session.optrack_buf));

	/* Set operation tracking on */
	F_SET(conn, WT_CONN_OPTRACK);

err:	__wt_scr_free(session, &buf);
	return (ret);
}
Example #21
0
/*
 * __wt_curlog_open --
 *	Initialize a log cursor.
 */
int
__wt_curlog_open(WT_SESSION_IMPL *session,
    const char *uri, const char *cfg[], WT_CURSOR **cursorp)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,		/* get-key */
	    __wt_cursor_get_value,		/* get-value */
	    __wt_cursor_set_key,		/* set-key */
	    __wt_cursor_set_value,		/* set-value */
	    __curlog_compare,			/* compare */
	    __wt_cursor_equals,			/* equals */
	    __curlog_next,			/* next */
	    __wt_cursor_notsup,			/* prev */
	    __curlog_reset,			/* reset */
	    __curlog_search,			/* search */
	    __wt_cursor_search_near_notsup,	/* search-near */
	    __wt_cursor_notsup,			/* insert */
	    __wt_cursor_notsup,			/* update */
	    __wt_cursor_notsup,			/* remove */
	    __wt_cursor_reconfigure_notsup,	/* reconfigure */
	    __curlog_close);			/* close */
	WT_CURSOR *cursor;
	WT_CURSOR_LOG *cl;
	WT_DECL_RET;
	WT_LOG *log;

	WT_STATIC_ASSERT(offsetof(WT_CURSOR_LOG, iface) == 0);
	conn = S2C(session);
	if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
		WT_RET_MSG(session, EINVAL,
		    "Cannot open a log cursor without logging enabled");

	log = conn->log;
	cl = NULL;
	WT_RET(__wt_calloc_one(session, &cl));
	cursor = &cl->iface;
	*cursor = iface;
	cursor->session = &session->iface;
	WT_ERR(__wt_calloc_one(session, &cl->cur_lsn));
	WT_ERR(__wt_calloc_one(session, &cl->next_lsn));
	WT_ERR(__wt_scr_alloc(session, 0, &cl->logrec));
	WT_ERR(__wt_scr_alloc(session, 0, &cl->opkey));
	WT_ERR(__wt_scr_alloc(session, 0, &cl->opvalue));
	cursor->key_format = WT_LOGC_KEY_FORMAT;
	cursor->value_format = WT_LOGC_VALUE_FORMAT;

	WT_INIT_LSN(cl->cur_lsn);
	WT_INIT_LSN(cl->next_lsn);

	WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));

	/*
	 * The user may be trying to read a log record they just wrote.
	 * Log records may be buffered, so force out any now.
	 */
	WT_ERR(__wt_log_force_write(session, 1));

	/* Log cursors block archiving. */
	WT_ERR(__wt_readlock(session, log->log_archive_lock));

	if (0) {
err:		if (F_ISSET(cursor, WT_CURSTD_OPEN))
			WT_TRET(cursor->close(cursor));
		else {
			__wt_free(session, cl->cur_lsn);
			__wt_free(session, cl->next_lsn);
			__wt_scr_free(session, &cl->logrec);
			__wt_scr_free(session, &cl->opkey);
			__wt_scr_free(session, &cl->opvalue);
			/*
			 * NOTE:  We cannot get on the error path with the
			 * readlock held.  No need to unlock it unless that
			 * changes above.
			 */
			__wt_free(session, cl);
		}
		*cursorp = NULL;
	}

	return (ret);
}
Example #22
0
/*
 * __wt_curjoin_open --
 *	Initialize a join cursor.
 *
 *	Join cursors are read-only.
 */
int
__wt_curjoin_open(WT_SESSION_IMPL *session,
    const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
{
	WT_CURSOR_STATIC_INIT(iface,
	    __curjoin_get_key,			/* get-key */
	    __curjoin_get_value,		/* get-value */
	    __wt_cursor_set_key_notsup,		/* set-key */
	    __wt_cursor_set_value_notsup,	/* set-value */
	    __wt_cursor_compare_notsup,		/* compare */
	    __wt_cursor_equals_notsup,		/* equals */
	    __curjoin_next,			/* next */
	    __wt_cursor_notsup,			/* prev */
	    __curjoin_reset,			/* reset */
	    __wt_cursor_notsup,			/* search */
	    __wt_cursor_search_near_notsup,	/* search-near */
	    __wt_cursor_notsup,			/* insert */
	    __wt_cursor_modify_notsup,		/* modify */
	    __wt_cursor_notsup,			/* update */
	    __wt_cursor_notsup,			/* remove */
	    __wt_cursor_notsup,			/* reserve */
	    __wt_cursor_reconfigure_notsup,	/* reconfigure */
	    __wt_cursor_notsup,			/* cache */
	    __wt_cursor_reopen_notsup,		/* reopen */
	    __curjoin_close);			/* close */
	WT_CURSOR *cursor;
	WT_CURSOR_JOIN *cjoin;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_TABLE *table;
	size_t size;
	const char *tablename, *columns;

	WT_STATIC_ASSERT(offsetof(WT_CURSOR_JOIN, iface) == 0);

	if (owner != NULL)
		WT_RET_MSG(session, EINVAL,
		    "unable to initialize a join cursor with existing owner");

	tablename = uri;
	if (!WT_PREFIX_SKIP(tablename, "join:table:"))
		return (
		    __wt_unexpected_object_type(session, uri, "join:table:"));

	columns = strchr(tablename, '(');
	if (columns == NULL)
		size = strlen(tablename);
	else
		size = WT_PTRDIFF(columns, tablename);
	WT_RET(__wt_schema_get_table(
	    session, tablename, size, false, 0, &table));

	WT_RET(__wt_calloc_one(session, &cjoin));
	cursor = (WT_CURSOR *)cjoin;
	*cursor = iface;
	cursor->session = (WT_SESSION *)session;
	cursor->key_format = table->key_format;
	cursor->value_format = table->value_format;

	cjoin->table = table;

	/* Handle projections. */
	WT_ERR(__wt_scr_alloc(session, 0, &tmp));
	if (columns != NULL) {
		WT_ERR(__wt_struct_reformat(session, table,
		    columns, strlen(columns), NULL, false, tmp));
		WT_ERR(__wt_strndup(
		    session, tmp->data, tmp->size, &cursor->value_format));
		WT_ERR(__wt_strdup(session, columns, &cjoin->projection));
	}

	WT_ERR(__wt_cursor_init(cursor, uri, owner, cfg, cursorp));

	if (0) {
err:		WT_TRET(__curjoin_close(cursor));
		*cursorp = NULL;
	}

	__wt_scr_free(session, &tmp);
	return (ret);
}
Example #23
0
File: bt_io.c Project: qihsh/mongo
/*
 * __wt_bt_read --
 *	Read a cookie referenced block into a buffer.
 */
int
__wt_bt_read(WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_ITEM(etmp);
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_ENCRYPTOR *encryptor;
	WT_ITEM *ip;
	const WT_PAGE_HEADER *dsk;
	const char *fail_msg;
	size_t result_len;

	btree = S2BT(session);
	bm = btree->bm;
	fail_msg = NULL;			/* -Wuninitialized */

	/*
	 * If anticipating a compressed or encrypted block, read into a scratch
	 * buffer and decompress into the caller's buffer.  Else, read directly
	 * into the caller's buffer.
	 */
	if (btree->compressor == NULL && btree->kencryptor == NULL) {
		WT_RET(bm->read(bm, session, buf, addr, addr_size));
		dsk = buf->data;
		ip = NULL;
	} else {
		WT_RET(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->read(bm, session, tmp, addr, addr_size));
		dsk = tmp->data;
		ip = tmp;
	}

	/*
	 * If the block is encrypted, copy the skipped bytes of the original
	 * image into place, then decrypt.
	 */
	if (F_ISSET(dsk, WT_PAGE_ENCRYPTED)) {
		if (btree->kencryptor == NULL ||
		    (encryptor = btree->kencryptor->encryptor) == NULL ||
		    encryptor->decrypt == NULL) {
			fail_msg =
			    "encrypted block in file for which no encryption "
			    "configured";
			goto corrupt;
		}

		WT_ERR(__wt_scr_alloc(session, 0, &etmp));
		if ((ret = __wt_decrypt(session,
		    encryptor, WT_BLOCK_ENCRYPT_SKIP, ip, etmp)) != 0) {
			fail_msg = "block decryption failed";
			goto corrupt;
		}

		ip = etmp;
		dsk = ip->data;
	} else if (btree->kencryptor != NULL) {
		fail_msg =
		    "unencrypted block in file for which encryption configured";
		goto corrupt;
	}

	if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) {
		if (btree->compressor == NULL ||
		    btree->compressor->decompress == NULL) {
			fail_msg =
			    "compressed block in file for which no compression "
			    "configured";
			goto corrupt;
		}

		/*
		 * Size the buffer based on the in-memory bytes we're expecting
		 * from decompression.
		 */
		WT_ERR(__wt_buf_initsize(session, buf, dsk->mem_size));

		/*
		 * Note the source length is NOT the number of compressed bytes,
		 * it's the length of the block we just read (minus the skipped
		 * bytes).  We don't store the number of compressed bytes: some
		 * compression engines need that length stored externally, they
		 * don't have markers in the stream to signal the end of the
		 * compressed bytes.  Those engines must store the compressed
		 * byte length somehow, see the snappy compression extension for
		 * an example.
		 */
		memcpy(buf->mem, ip->data, WT_BLOCK_COMPRESS_SKIP);
		ret = btree->compressor->decompress(
		    btree->compressor, &session->iface,
		    (uint8_t *)ip->data + WT_BLOCK_COMPRESS_SKIP,
		    tmp->size - WT_BLOCK_COMPRESS_SKIP,
		    (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP,
		    dsk->mem_size - WT_BLOCK_COMPRESS_SKIP, &result_len);

		/*
		 * If checksums were turned off because we're depending on the
		 * decompression to fail on any corrupted data, we'll end up
		 * here after corruption happens.  If we're salvaging the file,
		 * it's OK, otherwise it's really, really bad.
		 */
		if (ret != 0 ||
		    result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP) {
			fail_msg = "block decryption failed";
			goto corrupt;
		}
	} else
		/*
		 * If we uncompressed above, the page is in the correct buffer.
		 * If we get here the data may be in the wrong buffer and the
		 * buffer may be the wrong size.  If needed, get the page
		 * into the destination buffer.
		 */
		if (ip != NULL)
			WT_ERR(__wt_buf_set(
			    session, buf, ip->data, dsk->mem_size));

	/* If the handle is a verify handle, verify the physical page. */
	if (F_ISSET(btree, WT_BTREE_VERIFY)) {
		if (tmp == NULL)
			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size));
		WT_ERR(__wt_verify_dsk(session, tmp->data, buf));
	}

	WT_STAT_FAST_CONN_INCR(session, cache_read);
	WT_STAT_FAST_DATA_INCR(session, cache_read);
	if (F_ISSET(dsk, WT_PAGE_COMPRESSED))
		WT_STAT_FAST_DATA_INCR(session, compress_read);
	WT_STAT_FAST_CONN_INCRV(session, cache_bytes_read, dsk->mem_size);
	WT_STAT_FAST_DATA_INCRV(session, cache_bytes_read, dsk->mem_size);

	if (0) {
corrupt:	if (ret == 0)
			ret = WT_ERROR;
		if (!F_ISSET(btree, WT_BTREE_VERIFY) &&
		    !F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE)) {
			__wt_err(session, ret, "%s", fail_msg);
			ret = __wt_illegal_value(session, btree->dhandle->name);
		}
	}

err:	__wt_scr_free(session, &tmp);
	__wt_scr_free(session, &etmp);
	return (ret);
}
Example #24
0
/*
 * __curjoin_init_bloom --
 *	Populate Bloom filters
 */
static int
__curjoin_init_bloom(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    WT_CURSOR_JOIN_ENTRY *entry, WT_BLOOM *bloom)
{
	WT_COLLATOR *collator;
	WT_CURSOR *c;
	WT_CURSOR_JOIN_ENDPOINT *end, *endmax;
	WT_DECL_ITEM(uribuf);
	WT_DECL_RET;
	WT_ITEM curkey, curvalue;
	size_t size;
	u_int skip;
	int cmp;
	const char *uri;
	const char *raw_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), "raw", NULL };

	c = NULL;
	skip = 0;

	if (entry->index != NULL)
		/*
		 * Open the raw index.  We're avoiding any references
		 * to the main table, they may be expensive.
		 */
		uri = entry->index->source;
	else {
		/*
		 * For joins on the main table, we just need the primary
		 * key for comparison, we don't need any values.
		 */
		size = strlen(cjoin->table->iface.name) + 3;
		WT_ERR(__wt_scr_alloc(session, size, &uribuf));
		WT_ERR(__wt_buf_fmt(session, uribuf, "%s()",
		    cjoin->table->iface.name));
		uri = uribuf->data;
	}
	WT_ERR(__wt_open_cursor(session, uri, &cjoin->iface, raw_cfg, &c));

	/* Initially position the cursor if necessary. */
	endmax = &entry->ends[entry->ends_next];
	if ((end = &entry->ends[0]) < endmax) {
		if (F_ISSET(end, WT_CURJOIN_END_GT) ||
		    WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_EQ) {
			WT_ERR(__wt_cursor_dup_position(end->cursor, c));
			if (WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_GE)
				skip = 1;
		} else if (F_ISSET(end, WT_CURJOIN_END_LT)) {
			if ((ret = c->next(c)) == WT_NOTFOUND)
				goto done;
			WT_ERR(ret);
		} else
			WT_PANIC_ERR(session, EINVAL,
			    "fatal error in join cursor position state");
	}
	collator = (entry->index == NULL) ? NULL : entry->index->collator;
	while (ret == 0) {
		WT_ERR(c->get_key(c, &curkey));
		entry->stats.iterated++;
		if (entry->index != NULL) {
			/*
			 * Repack so it's comparable to the
			 * reference endpoints.
			 */
			WT_ERR(__wt_struct_repack(session,
			    c->key_format,
			    (entry->repack_format != NULL ?
			    entry->repack_format : entry->index->idxkey_format),
			    &c->key, &curkey));
		}
		for (end = &entry->ends[skip]; end < endmax; end++) {
			WT_ERR(__wt_compare(session, collator, &curkey,
			    &end->key, &cmp));
			if (F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION)) {
				/* if condition satisfied, insert immediately */
				switch (WT_CURJOIN_END_RANGE(end)) {
				case WT_CURJOIN_END_EQ:
					if (cmp == 0)
						goto insert;
					break;
				case WT_CURJOIN_END_GT:
					if (cmp > 0) {
						/* skip this check next time */
						skip = entry->ends_next;
						goto insert;
					}
					break;
				case WT_CURJOIN_END_GE:
					if (cmp >= 0)
						goto insert;
					break;
				case WT_CURJOIN_END_LT:
					if (cmp < 0)
						goto insert;
					break;
				case WT_CURJOIN_END_LE:
					if (cmp <= 0)
						goto insert;
					break;
				}
			} else if (!F_ISSET(end, WT_CURJOIN_END_LT)) {
				if (cmp < 0 || (cmp == 0 &&
				    !F_ISSET(end, WT_CURJOIN_END_EQ)))
					goto advance;
				if (cmp > 0) {
					if (F_ISSET(end, WT_CURJOIN_END_GT))
						skip = 1;
					else
						goto done;
				}
			} else {
				if (cmp > 0 || (cmp == 0 &&
				    !F_ISSET(end, WT_CURJOIN_END_EQ)))
					goto done;
			}
		}
		/*
		 * Either it's a disjunction that hasn't satisfied any
		 * condition, or it's a conjunction that has satisfied all
		 * conditions.
		 */
		if (F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION))
			goto advance;
insert:
		if (entry->index != NULL) {
			curvalue.data =
			    (unsigned char *)curkey.data + curkey.size;
			WT_ASSERT(session, c->key.size > curkey.size);
			curvalue.size = c->key.size - curkey.size;
		}
		else
			WT_ERR(c->get_key(c, &curvalue));
		__wt_bloom_insert(bloom, &curvalue);
		entry->stats.bloom_insert++;
advance:
		if ((ret = c->next(c)) == WT_NOTFOUND)
			break;
	}
done:
	WT_ERR_NOTFOUND_OK(ret);

err:	if (c != NULL)
		WT_TRET(c->close(c));
	__wt_scr_free(session, &uribuf);
	return (ret);
}
Example #25
0
/*
 * __wt_win_directory_list --
 *	Get a list of files from a directory, MSVC version.
 */
int
__wt_win_directory_list(WT_FILE_SYSTEM *file_system,
    WT_SESSION *wt_session, const char *directory,
    const char *prefix, char ***dirlistp, uint32_t *countp)
{
	DWORD windows_error;
	HANDLE findhandle;
	WIN32_FIND_DATAW finddata;
	WT_DECL_ITEM(pathbuf);
	WT_DECL_ITEM(file_utf8);
	WT_DECL_ITEM(pathbuf_wide);
	WT_DECL_ITEM(prefix_wide);
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	size_t dirallocsz, pathlen, prefix_widelen;
	uint32_t count;
	char *dir_copy, **entries;

	session = (WT_SESSION_IMPL *)wt_session;

	*dirlistp = NULL;
	*countp = 0;

	findhandle = INVALID_HANDLE_VALUE;
	dirallocsz = 0;
	entries = NULL;

	WT_ERR(__wt_strdup(session, directory, &dir_copy));
	pathlen = strlen(dir_copy);
	if (dir_copy[pathlen - 1] == '\\')
		dir_copy[pathlen - 1] = '\0';
	WT_ERR(__wt_scr_alloc(session, pathlen + 3, &pathbuf));
	WT_ERR(__wt_buf_fmt(session, pathbuf, "%s\\*", dir_copy));

	WT_ERR(__wt_to_utf16_string(session, pathbuf->data, &pathbuf_wide));
	WT_ERR(__wt_to_utf16_string(session, prefix, &prefix_wide));
	prefix_widelen = wcslen(prefix_wide->data);

	findhandle = FindFirstFileW(pathbuf_wide->data, &finddata);
	if (findhandle == INVALID_HANDLE_VALUE) {
		windows_error = __wt_getlasterror();
		__wt_errx(session,
		    "%s: directory-list: FindFirstFile: %s",
		    pathbuf->data, __wt_formatmessage(session, windows_error));
		WT_ERR(__wt_map_windows_error(windows_error));
	}

	count = 0;
	do {
		/*
		 * Skip . and ..
		 */
		if (wcscmp(finddata.cFileName, L".") == 0 ||
		    wcscmp(finddata.cFileName, L"..") == 0)
			continue;

		/* The list of files is optionally filtered by a prefix. */
		if (prefix != NULL &&
		    wcsncmp(finddata.cFileName, prefix_wide->data,
			prefix_widelen) != 0)
			continue;

		WT_ERR(__wt_realloc_def(
		    session, &dirallocsz, count + 1, &entries));

		WT_ERR(__wt_to_utf8_string(
		    session, finddata.cFileName, &file_utf8));
		WT_ERR(__wt_strdup(session, file_utf8->data, &entries[count]));
		++count;
		__wt_scr_free(session, &file_utf8);
	} while (FindNextFileW(findhandle, &finddata) != 0);

	*dirlistp = entries;
	*countp = count;

err:	if (findhandle != INVALID_HANDLE_VALUE)
		if (FindClose(findhandle) == 0) {
			windows_error = __wt_getlasterror();
			__wt_errx(session,
			    "%s: directory-list: FindClose: %s",
			    pathbuf->data,
			    __wt_formatmessage(session, windows_error));
			if (ret == 0)
				ret = __wt_map_windows_error(windows_error);
		}

	__wt_free(session, dir_copy);
	__wt_scr_free(session, &pathbuf);
	__wt_scr_free(session, &file_utf8);
	__wt_scr_free(session, &pathbuf_wide);
	__wt_scr_free(session, &prefix_wide);

	if (ret == 0)
		return (0);

	WT_TRET(__wt_win_directory_list_free(
	    file_system, wt_session, entries, count));

	WT_RET_MSG(session, ret,
	    "%s: directory-list, prefix \"%s\"",
	    directory, prefix == NULL ? "" : prefix);
}
Example #26
0
/*
 * __wt_las_sweep --
 *	Sweep the lookaside table.
 */
int
__wt_las_sweep(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR *cursor;
	WT_DECL_ITEM(las_addr);
	WT_DECL_ITEM(las_key);
	WT_DECL_RET;
	WT_ITEM *key;
	uint64_t cnt, las_counter, las_txnid;
	int64_t remove_cnt;
	uint32_t las_id, session_flags;
	int notused;

	conn = S2C(session);
	cursor = NULL;
	key = &conn->las_sweep_key;
	remove_cnt = 0;
	session_flags = 0;		/* [-Werror=maybe-uninitialized] */

	WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
	WT_ERR(__wt_scr_alloc(session, 0, &las_key));

	WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));

	/*
	 * If we're not starting a new sweep, position the cursor using the key
	 * from the last call (we don't care if we're before or after the key,
	 * just roughly in the same spot is fine).
	 */
	if (key->size != 0) {
		__wt_cursor_set_raw_key(cursor, key);
		ret = cursor->search_near(cursor, &notused);

		/*
		 * Don't search for the same key twice; if we don't set a new
		 * key below, it's because we've reached the end of the table
		 * and we want the next pass to start at the beginning of the
		 * table. Searching for the same key could leave us stuck at
		 * the end of the table, repeatedly checking the same rows.
		 */
		key->size = 0;
		if (ret != 0)
			goto srch_notfound;
	}

	/*
	 * The sweep server wakes up every 10 seconds (by default), it's a slow
	 * moving thread. Try to review the entire lookaside table once every 5
	 * minutes, or every 30 calls.
	 *
	 * The reason is because the lookaside table exists because we're seeing
	 * cache/eviction pressure (it allows us to trade performance and disk
	 * space for cache space), and it's likely lookaside blocks are being
	 * evicted, and reading them back in doesn't help things. A trickier,
	 * but possibly better, alternative might be to review all lookaside
	 * blocks in the cache in order to get rid of them, and slowly review
	 * lookaside blocks that have already been evicted.
	 */
	cnt = (uint64_t)WT_MAX(100, conn->las_record_cnt / 30);

	/* Discard pages we read as soon as we're done with them. */
	F_SET(session, WT_SESSION_NO_CACHE);

	/* Walk the file. */
	for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) {
		/*
		 * If the loop terminates after completing a work unit, we will
		 * continue the table sweep next time. Get a local copy of the
		 * sweep key, we're going to reset the cursor; do so before
		 * calling cursor.remove, cursor.remove can discard our hazard
		 * pointer and the page could be evicted from underneath us.
		 */
		if (cnt == 1) {
			WT_ERR(__wt_cursor_get_raw_key(cursor, key));
			if (!WT_DATA_IN_ITEM(key))
				WT_ERR(__wt_buf_set(
				    session, key, key->data, key->size));
		}

		WT_ERR(cursor->get_key(cursor,
		    &las_id, las_addr, &las_counter, &las_txnid, las_key));

		/*
		 * If the on-page record transaction ID associated with the
		 * record is globally visible, the record can be discarded.
		 *
		 * Cursor opened overwrite=true: won't return WT_NOTFOUND should
		 * another thread remove the record before we do, and the cursor
		 * remains positioned in that case.
		 */
		if (__wt_txn_visible_all(session, las_txnid)) {
			WT_ERR(cursor->remove(cursor));
			++remove_cnt;
		}
	}

srch_notfound:
	WT_ERR_NOTFOUND_OK(ret);

	if (0) {
err:		__wt_buf_free(session, key);
	}

	WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));

	/*
	 * If there were races to remove records, we can over-count.  All
	 * arithmetic is signed, so underflow isn't fatal, but check anyway so
	 * we don't skew low over time.
	 */
	if (remove_cnt > S2C(session)->las_record_cnt)
		S2C(session)->las_record_cnt = 0;
	else if (remove_cnt > 0)
		(void)__wt_atomic_subi64(&conn->las_record_cnt, remove_cnt);

	F_CLR(session, WT_SESSION_NO_CACHE);

	__wt_scr_free(session, &las_addr);
	__wt_scr_free(session, &las_key);

	return (ret);
}
/*
 * __config_merge_scan --
 *	Walk a configuration string, inserting entries into the merged array.
 */
static int
__config_merge_scan(WT_SESSION_IMPL *session,
    const char *key, const char *value, WT_CONFIG_MERGE *cp)
{
	WT_CONFIG cparser;
	WT_CONFIG_ITEM k, v;
	WT_DECL_ITEM(kb);
	WT_DECL_ITEM(vb);
	WT_DECL_RET;
	size_t len;

	WT_ERR(__wt_scr_alloc(session, 0, &kb));
	WT_ERR(__wt_scr_alloc(session, 0, &vb));

	WT_ERR(__wt_config_init(session, &cparser, value));
	while ((ret = __wt_config_next(&cparser, &k, &v)) == 0) {
		if (k.type != WT_CONFIG_ITEM_STRING &&
		    k.type != WT_CONFIG_ITEM_ID)
			WT_ERR_MSG(session, EINVAL,
			    "Invalid configuration key found: '%s'\n", k.str);

		/* Include the quotes around string keys/values. */
		if (k.type == WT_CONFIG_ITEM_STRING) {
			--k.str;
			k.len += 2;
		}
		if (v.type == WT_CONFIG_ITEM_STRING) {
			--v.str;
			v.len += 2;
		}

		/*
		 * !!!
		 * We're using a JSON quote character to separate the names we
		 * create for nested structures. That's not completely safe as
		 * it's possible to quote characters in JSON such that a quote
		 * character appears as a literal character in a key name. In
		 * a few cases, applications can create their own key namespace
		 * (for example, shared library extension names), and therefore
		 * it's possible for an application to confuse us. Error if we
		 * we ever see a key with a magic character.
		 */
		for (len = 0; len < k.len; ++len)
			if (k.str[len] == SEPC)
				WT_ERR_MSG(session, EINVAL,
				    "key %.*s contains a '%c' separator "
				    "character",
				    (int)k.len, (char *)k.str, SEPC);

		/* Build the key/value strings. */
		WT_ERR(__wt_buf_fmt(session,
		    kb, "%s%s%.*s",
		    key == NULL ? "" : key,
		    key == NULL ? "" : SEP,
		    (int)k.len, k.str));
		WT_ERR(__wt_buf_fmt(session,
		    vb, "%.*s", (int)v.len, v.str));

		/*
		 * If the value is a structure, recursively parse it.
		 *
		 * !!!
		 * Don't merge unless the structure has field names. WiredTiger
		 * stores checkpoint LSNs in the metadata file using nested
		 * structures without field names: "checkpoint_lsn=(1,0)", not
		 * "checkpoint_lsn=(file=1,offset=0)". The value type is still
		 * WT_CONFIG_ITEM_STRUCT, so we check for a field name in the
		 * value.
		 */
		if (v.type == WT_CONFIG_ITEM_STRUCT &&
		    strchr(vb->data, '=') != NULL) {
			WT_ERR(__config_merge_scan(
			    session, kb->data, vb->data, cp));
			continue;
		}

		/* Insert the value into the array. */
		WT_ERR(__wt_realloc_def(session,
		    &cp->entries_allocated,
		    cp->entries_next + 1, &cp->entries));
		WT_ERR(__wt_strndup(session,
		    kb->data, kb->size, &cp->entries[cp->entries_next].k));
		WT_ERR(__wt_strndup(session,
		    vb->data, vb->size, &cp->entries[cp->entries_next].v));
		cp->entries[cp->entries_next].gen = cp->entries_next;
		++cp->entries_next;
	}
	WT_ERR_NOTFOUND_OK(ret);

err:	__wt_scr_free(session, &kb);
	__wt_scr_free(session, &vb);
	return (ret);
}
Example #28
0
/*
 * __create_file --
 *	Create a new 'file:' object.
 */
static int
__create_file(WT_SESSION_IMPL *session,
    const char *uri, int exclusive, const char *config)
{
	WT_DECL_ITEM(val);
	WT_DECL_RET;
	uint32_t allocsize;
	int is_metadata;
	const char *filename, **p, *filecfg[] =
	    { WT_CONFIG_BASE(session, file_meta), config, NULL, NULL };
	char *fileconf;

	fileconf = NULL;

	is_metadata = strcmp(uri, WT_METAFILE_URI) == 0;

	filename = uri;
	if (!WT_PREFIX_SKIP(filename, "file:"))
		WT_RET_MSG(session, EINVAL, "Expected a 'file:' URI: %s", uri);

	/* Check if the file already exists. */
	if (!is_metadata && (ret =
	    __wt_metadata_search(session, uri, &fileconf)) != WT_NOTFOUND) {
		if (exclusive)
			WT_TRET(EEXIST);
		goto err;
	}

	/* Sanity check the allocation size. */
	WT_RET(__wt_direct_io_size_check(
	    session, filecfg, "allocation_size", &allocsize));

	/* Create the file. */
	WT_ERR(__wt_block_manager_create(session, filename, allocsize));
	if (WT_META_TRACKING(session))
		WT_ERR(__wt_meta_track_fileop(session, NULL, uri));

	/*
	 * If creating an ordinary file, append the file ID and current version
	 * numbers to the passed-in configuration and insert the resulting
	 * configuration into the metadata.
	 */
	if (!is_metadata) {
		WT_ERR(__wt_scr_alloc(session, 0, &val));
		WT_ERR(__wt_buf_fmt(session, val,
		    "id=%" PRIu32 ",version=(major=%d,minor=%d)",
		    ++S2C(session)->next_file_id,
		    WT_BTREE_MAJOR_VERSION_MAX, WT_BTREE_MINOR_VERSION_MAX));
		for (p = filecfg; *p != NULL; ++p)
			;
		*p = val->data;
		WT_ERR(__wt_config_collapse(session, filecfg, &fileconf));
		WT_ERR(__wt_metadata_insert(session, uri, fileconf));
	}

	/*
	 * Open the file to check that it was setup correctly. We don't need to
	 * pass the configuration, we just wrote the collapsed configuration
	 * into the metadata file, and it's going to be read/used by underlying
	 * functions.
	 *
	 * Keep the handle exclusive until it is released at the end of the
	 * call, otherwise we could race with a drop.
	 */
	WT_ERR(__wt_session_get_btree(
	    session, uri, NULL, NULL, WT_DHANDLE_EXCLUSIVE));
	if (WT_META_TRACKING(session))
		WT_ERR(__wt_meta_track_handle_lock(session, 1));
	else
		WT_ERR(__wt_session_release_btree(session));

err:	__wt_scr_free(session, &val);
	__wt_free(session, fileconf);
	return (ret);
}
Example #29
0
/*
 * __wt_meta_ckptlist_set --
 *	Set a file's checkpoint value from the WT_CKPT list.
 */
int
__wt_meta_ckptlist_set(WT_SESSION_IMPL *session,
    const char *fname, WT_CKPT *ckptbase, WT_LSN *ckptlsn)
{
	WT_CKPT *ckpt;
	WT_DECL_ITEM(buf);
	WT_DECL_RET;
	time_t secs;
	int64_t maxorder;
	const char *sep;

	WT_ERR(__wt_scr_alloc(session, 0, &buf));
	maxorder = 0;
	sep = "";
	WT_ERR(__wt_buf_fmt(session, buf, "checkpoint=("));
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		/*
		 * Each internal checkpoint name is appended with a generation
		 * to make it a unique name.  We're solving two problems: when
		 * two checkpoints are taken quickly, the timer may not be
		 * unique and/or we can even see time travel on the second
		 * checkpoint if we snapshot the time in-between nanoseconds
		 * rolling over.  Second, if we reset the generational counter
		 * when new checkpoints arrive, we could logically re-create
		 * specific checkpoints, racing with cursors open on those
		 * checkpoints.  I can't think of any way to return incorrect
		 * results by racing with those cursors, but it's simpler not
		 * to worry about it.
		 */
		if (ckpt->order > maxorder)
			maxorder = ckpt->order;

		/* Skip deleted checkpoints. */
		if (F_ISSET(ckpt, WT_CKPT_DELETE))
			continue;

		if (F_ISSET(ckpt, WT_CKPT_ADD | WT_CKPT_UPDATE)) {
			/*
			 * We fake checkpoints for handles in the middle of a
			 * bulk load.  If there is a checkpoint, convert the
			 * raw cookie to a hex string.
			 */
			if (ckpt->raw.size == 0)
				ckpt->addr.size = 0;
			else
				WT_ERR(__wt_raw_to_hex(session,
				    ckpt->raw.data,
				    ckpt->raw.size, &ckpt->addr));

			/* Set the order and timestamp. */
			if (F_ISSET(ckpt, WT_CKPT_ADD))
				ckpt->order = ++maxorder;

			/*
			 * XXX
			 * Assumes a time_t fits into a uintmax_t, which isn't
			 * guaranteed, a time_t has to be an arithmetic type,
			 * but not an integral type.
			 */
			__wt_seconds(session, &secs);
			ckpt->sec = (uintmax_t)secs;
		}
		if (strcmp(ckpt->name, WT_CHECKPOINT) == 0)
			WT_ERR(__wt_buf_catfmt(session, buf,
			    "%s%s.%" PRId64 "=(addr=\"%.*s\",order=%" PRId64
			    ",time=%" PRIuMAX ",size=%" PRIu64
			    ",write_gen=%" PRIu64 ")",
			    sep, ckpt->name, ckpt->order,
			    (int)ckpt->addr.size, (char *)ckpt->addr.data,
			    ckpt->order, ckpt->sec, ckpt->ckpt_size,
			    ckpt->write_gen));
		else
			WT_ERR(__wt_buf_catfmt(session, buf,
			    "%s%s=(addr=\"%.*s\",order=%" PRId64
			    ",time=%" PRIuMAX ",size=%" PRIu64
			    ",write_gen=%" PRIu64 ")",
			    sep, ckpt->name,
			    (int)ckpt->addr.size, (char *)ckpt->addr.data,
			    ckpt->order, ckpt->sec, ckpt->ckpt_size,
			    ckpt->write_gen));
		sep = ",";
	}
	WT_ERR(__wt_buf_catfmt(session, buf, ")"));
	if (ckptlsn != NULL)
		WT_ERR(__wt_buf_catfmt(session, buf,
		    ",checkpoint_lsn=(%" PRIu32 ",%" PRIuMAX ")",
		    ckptlsn->l.file, (uintmax_t)ckptlsn->l.offset));
	WT_ERR(__ckpt_set(session, fname, buf->mem));

err:	__wt_scr_free(session, &buf);
	return (ret);
}
Example #30
0
/*
 * __wt_curlog_open --
 *	Initialize a log cursor.
 */
int
__wt_curlog_open(WT_SESSION_IMPL *session,
    const char *uri, const char *cfg[], WT_CURSOR **cursorp)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,		/* get-key */
	    __wt_cursor_get_value,		/* get-value */
	    __wt_cursor_set_key,		/* set-key */
	    __wt_cursor_set_value,		/* set-value */
	    __curlog_compare,			/* compare */
	    __wt_cursor_equals,			/* equals */
	    __curlog_next,			/* next */
	    __wt_cursor_notsup,			/* prev */
	    __curlog_reset,			/* reset */
	    __curlog_search,			/* search */
	    __wt_cursor_search_near_notsup,	/* search-near */
	    __wt_cursor_notsup,			/* insert */
	    __wt_cursor_modify_notsup,		/* modify */
	    __wt_cursor_notsup,			/* update */
	    __wt_cursor_notsup,			/* remove */
	    __wt_cursor_notsup,			/* reserve */
	    __wt_cursor_reconfigure_notsup,	/* reconfigure */
	    __wt_cursor_notsup,			/* cache */
	    __wt_cursor_reopen_notsup,		/* reopen */
	    __curlog_close);			/* close */
	WT_CURSOR *cursor;
	WT_CURSOR_LOG *cl;
	WT_DECL_RET;
	WT_LOG *log;

	WT_STATIC_ASSERT(offsetof(WT_CURSOR_LOG, iface) == 0);

	conn = S2C(session);
	log = conn->log;

	WT_RET(__wt_calloc_one(session, &cl));
	cursor = (WT_CURSOR *)cl;
	*cursor = iface;
	cursor->session = (WT_SESSION *)session;
	cursor->key_format = WT_LOGC_KEY_FORMAT;
	cursor->value_format = WT_LOGC_VALUE_FORMAT;

	WT_ERR(__wt_calloc_one(session, &cl->cur_lsn));
	WT_ERR(__wt_calloc_one(session, &cl->next_lsn));
	WT_ERR(__wt_scr_alloc(session, 0, &cl->logrec));
	WT_ERR(__wt_scr_alloc(session, 0, &cl->opkey));
	WT_ERR(__wt_scr_alloc(session, 0, &cl->opvalue));
	WT_INIT_LSN(cl->cur_lsn);
	WT_INIT_LSN(cl->next_lsn);

	WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));

	if (log != NULL) {
		/*
		 * The user may be trying to read a log record they just wrote.
		 * Log records may be buffered, so force out any now.
		 */
		WT_ERR(__wt_log_force_write(session, 1, NULL));

		/* Log cursors block archiving. */
		__wt_readlock(session, &log->log_archive_lock);
		F_SET(cl, WT_CURLOG_ARCHIVE_LOCK);
		(void)__wt_atomic_add32(&conn->log_cursors, 1);

	}

	if (0) {
err:		WT_TRET(__curlog_close(cursor));
		*cursorp = NULL;
	}

	return (ret);
}