Exemplo n.º 1
0
/*
 * __wt_txn_config --
 *	Configure a transaction.
 */
int
__wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CONFIG_ITEM cval;
	WT_TXN *txn;

	txn = &session->txn;

	WT_RET(__wt_config_gets_def(session, cfg, "isolation", 0, &cval));
	if (cval.len != 0)
		txn->isolation =
		    WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
		    WT_ISO_SNAPSHOT :
		    WT_STRING_MATCH("read-committed", cval.str, cval.len) ?
		    WT_ISO_READ_COMMITTED : WT_ISO_READ_UNCOMMITTED;

	/*
	 * The default sync setting is inherited from the connection, but can
	 * be overridden by an explicit "sync" setting for this transaction.
	 *
	 * We want to distinguish between inheriting implicitly and explicitly.
	 */
	F_CLR(txn, WT_TXN_SYNC_SET);
	WT_RET(__wt_config_gets_def(
	    session, cfg, "sync", (int)UINT_MAX, &cval));
	if (cval.val == 0 || cval.val == 1)
		/*
		 * This is an explicit setting of sync.  Set the flag so
		 * that we know not to overwrite it in commit_transaction.
		 */
		F_SET(txn, WT_TXN_SYNC_SET);

	/*
	 * If sync is turned off explicitly, clear the transaction's sync field.
	 */
	if (cval.val == 0)
		txn->txn_logsync = 0;

	WT_RET(__wt_config_gets_def(session, cfg, "snapshot", 0, &cval));
	if (cval.len > 0)
		/*
		 * The layering here isn't ideal - the named snapshot get
		 * function does both validation and setup. Otherwise we'd
		 * need to walk the list of named snapshots twice during
		 * transaction open.
		 */
		WT_RET(__wt_txn_named_snapshot_get(session, &cval));

	return (0);
}
Exemplo n.º 2
0
/*
 * __wt_txn_begin --
 *	Begin a transaction.
 */
int
__wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_STATE *txn_state;

	conn = S2C(session);
	txn = &session->txn;
	txn_global = &conn->txn_global;
	txn_state = &txn_global->states[session->id];

	WT_ASSERT(session, txn_state->id == WT_TXN_NONE);

	WT_RET(__wt_config_gets_def(session, cfg, "isolation", 0, &cval));
	if (cval.len == 0)
		txn->isolation = session->isolation;
	else
		txn->isolation =
		    WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
		    TXN_ISO_SNAPSHOT :
		    WT_STRING_MATCH("read-committed", cval.str, cval.len) ?
		    TXN_ISO_READ_COMMITTED : TXN_ISO_READ_UNCOMMITTED;

	/*
	 * Allocate a transaction ID.
	 *
	 * We use an atomic compare and swap to ensure that we get a
	 * unique ID that is published before the global counter is
	 * updated.
	 *
	 * If two threads race to allocate an ID, only the latest ID
	 * will proceed.  The winning thread can be sure its snapshot
	 * contains all of the earlier active IDs.  Threads that race
	 * and get an earlier ID may not appear in the snapshot, but
	 * they will loop and allocate a new ID before proceeding to
	 * make any updates.
	 *
	 * This potentially wastes transaction IDs when threads race to
	 * begin transactions: that is the price we pay to keep this
	 * path latch free.
	 */
	do {
		txn_state->id = txn->id = txn_global->current;
	} while (!WT_ATOMIC_CAS(txn_global->current, txn->id, txn->id + 1));

	/*
	 * If we have used 64-bits of transaction IDs, there is nothing
	 * more we can do.
	 */
	if (txn->id == WT_TXN_ABORTED)
		WT_RET_MSG(session, ENOMEM, "Out of transaction IDs");

	F_SET(txn, TXN_RUNNING);
	if (txn->isolation == TXN_ISO_SNAPSHOT)
		__wt_txn_refresh(session, WT_TXN_NONE, 1);
	return (0);
}
Exemplo n.º 3
0
/*
 * __wt_curfile_open --
 *	WT_SESSION->open_cursor method for the btree cursor type.
 */
int
__wt_curfile_open(WT_SESSION_IMPL *session, const char *uri,
                  WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
{
    WT_CONFIG_ITEM cval;
    WT_DECL_RET;
    uint32_t flags;
    bool bitmap, bulk;

    bitmap = bulk = false;
    flags = 0;

    WT_RET(__wt_config_gets_def(session, cfg, "bulk", 0, &cval));
    if (cval.type == WT_CONFIG_ITEM_BOOL ||
            (cval.type == WT_CONFIG_ITEM_NUM &&
             (cval.val == 0 || cval.val == 1))) {
        bitmap = false;
        bulk = cval.val != 0;
    } else if (WT_STRING_MATCH("bitmap", cval.str, cval.len))
        bitmap = bulk = true;
    else
        WT_RET_MSG(session, EINVAL,
                   "Value for 'bulk' must be a boolean or 'bitmap'");

    /* Bulk handles require exclusive access. */
    if (bulk)
        LF_SET(WT_BTREE_BULK | WT_DHANDLE_EXCLUSIVE);

    /* Get the handle and lock it while the cursor is using it. */
    if (WT_PREFIX_MATCH(uri, "file:")) {
        /*
         * If we are opening a bulk cursor, get the handle while
         * holding the checkpoint lock.  This prevents a bulk cursor
         * open failing with EBUSY due to a database-wide checkpoint.
         */
        if (bulk)
            __wt_spin_lock(
                session, &S2C(session)->checkpoint_lock);
        ret = __wt_session_get_btree_ckpt(session, uri, cfg, flags);
        if (bulk)
            __wt_spin_unlock(
                session, &S2C(session)->checkpoint_lock);
        WT_RET(ret);
    } else
        WT_RET(__wt_bad_object_type(session, uri));

    WT_ERR(__wt_curfile_create(session, owner, cfg, bulk, bitmap, cursorp));

    /* Increment the data-source's in-use counter. */
    __wt_cursor_dhandle_incr_use(session);
    return (0);

err:	/* If the cursor could not be opened, release the handle. */
    WT_TRET(__wt_session_release_btree(session));
    return (ret);
}
Exemplo n.º 4
0
/*
 * __logmgr_config --
 *	Parse and setup the logging server options.
 */
static int
__logmgr_config(WT_SESSION_IMPL *session, const char **cfg, int *runp)
{
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;

	conn = S2C(session);

	/*
	 * The logging configuration is off by default.
	 */
	WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval));
	*runp = cval.val != 0;

	/*
	 * Setup a log path, compression and encryption even if logging is
	 * disabled in case we are going to print a log.
	 */
	conn->log_compressor = NULL;
	WT_RET(__wt_config_gets_none(session, cfg, "log.compressor", &cval));
	WT_RET(__wt_compressor_config(session, &cval, &conn->log_compressor));

	WT_RET(__wt_config_gets(session, cfg, "log.path", &cval));
	WT_RET(__wt_strndup(session, cval.str, cval.len, &conn->log_path));

	/* We are done if logging isn't enabled. */
	if (*runp == 0)
		return (0);

	WT_RET(__wt_config_gets(session, cfg, "log.archive", &cval));
	if (cval.val != 0)
		FLD_SET(conn->log_flags, WT_CONN_LOG_ARCHIVE);

	WT_RET(__wt_config_gets(session, cfg, "log.file_max", &cval));
	conn->log_file_max = (wt_off_t)cval.val;
	WT_STAT_FAST_CONN_SET(session, log_max_filesize, conn->log_file_max);

	WT_RET(__wt_config_gets(session, cfg, "log.prealloc", &cval));
	/*
	 * If pre-allocation is configured, set the initial number to one.
	 * We'll adapt as load dictates.
	 */
	if (cval.val != 0) {
		FLD_SET(conn->log_flags, WT_CONN_LOG_PREALLOC);
		conn->log_prealloc = 1;
	}
	WT_RET(__wt_config_gets_def(session, cfg, "log.recover", 0, &cval));
	if (cval.len != 0  && WT_STRING_MATCH("error", cval.str, cval.len))
		FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR);

	WT_RET(__logmgr_sync_cfg(session, cfg));
	return (0);
}
Exemplo n.º 5
0
/*
 * __wt_schema_drop --
 *	Process a WT_SESSION::drop operation for all supported types.
 */
int
__wt_schema_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
{
    WT_CONFIG_ITEM cval;
    WT_DATA_SOURCE *dsrc;
    WT_DECL_RET;
    bool force;

    WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval));
    force = cval.val != 0;

    WT_RET(__wt_meta_track_on(session));

    /* Paranoia: clear any handle from our caller. */
    session->dhandle = NULL;

    if (WT_PREFIX_MATCH(uri, "colgroup:"))
        ret = __drop_colgroup(session, uri, force, cfg);
    else if (WT_PREFIX_MATCH(uri, "file:"))
        ret = __drop_file(session, uri, force, cfg);
    else if (WT_PREFIX_MATCH(uri, "index:"))
        ret = __drop_index(session, uri, force, cfg);
    else if (WT_PREFIX_MATCH(uri, "lsm:"))
        ret = __wt_lsm_tree_drop(session, uri, cfg);
    else if (WT_PREFIX_MATCH(uri, "table:"))
        ret = __drop_table(session, uri, cfg);
    else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL)
        ret = dsrc->drop == NULL ?
              __wt_object_unsupported(session, uri) :
              dsrc->drop(
                  dsrc, &session->iface, uri, (WT_CONFIG_ARG *)cfg);
    else
        ret = __wt_bad_object_type(session, uri);

    /*
     * Map WT_NOTFOUND to ENOENT, based on the assumption WT_NOTFOUND means
     * there was no metadata entry.  Map ENOENT to zero if force is set.
     */
    if (ret == WT_NOTFOUND || ret == ENOENT)
        ret = force ? 0 : ENOENT;

    /* Bump the schema generation so that stale data is ignored. */
    ++S2C(session)->schema_gen;

    WT_TRET(__wt_meta_track_off(session, true, ret != 0));

    return (ret);
}
Exemplo n.º 6
0
/*
 * __wt_curfile_create --
 *	Open a cursor for a given btree handle.
 */
int
__wt_curfile_create(WT_SESSION_IMPL *session,
    WT_CURSOR *owner, const char *cfg[], int bulk, int bitmap,
    WT_CURSOR **cursorp)
{
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,	/* get-key */
	    __wt_cursor_get_value,	/* get-value */
	    __wt_cursor_set_key,	/* set-key */
	    __wt_cursor_set_value,	/* set-value */
	    __curfile_compare,		/* compare */
	    __curfile_equals,		/* equals */
	    __curfile_next,		/* next */
	    __curfile_prev,		/* prev */
	    __curfile_reset,		/* reset */
	    __curfile_search,		/* search */
	    __curfile_search_near,	/* search-near */
	    __curfile_insert,		/* insert */
	    __curfile_update,		/* update */
	    __curfile_remove,		/* remove */
	    __wt_cursor_reconfigure,	/* reconfigure */
	    __curfile_close);		/* close */
	WT_BTREE *btree;
	WT_CONFIG_ITEM cval;
	WT_CURSOR *cursor;
	WT_CURSOR_BTREE *cbt;
	WT_CURSOR_BULK *cbulk;
	WT_DECL_RET;
	size_t csize;

	WT_STATIC_ASSERT(offsetof(WT_CURSOR_BTREE, iface) == 0);

	cbt = NULL;

	btree = S2BT(session);
	WT_ASSERT(session, btree != NULL);

	csize = bulk ? sizeof(WT_CURSOR_BULK) : sizeof(WT_CURSOR_BTREE);
	WT_RET(__wt_calloc(session, 1, csize, &cbt));

	cursor = &cbt->iface;
	*cursor = iface;
	cursor->session = &session->iface;
	cursor->internal_uri = btree->dhandle->name;
	cursor->key_format = btree->key_format;
	cursor->value_format = btree->value_format;
	cbt->btree = btree;

	if (bulk) {
		F_SET(cursor, WT_CURSTD_BULK);

		cbulk = (WT_CURSOR_BULK *)cbt;

		/* Optionally skip the validation of each bulk-loaded key. */
		WT_ERR(__wt_config_gets_def(
		    session, cfg, "skip_sort_check", 0, &cval));
		WT_ERR(__wt_curbulk_init(
		    session, cbulk, bitmap, cval.val == 0 ? 0 : 1));
	}

	/*
	 * random_retrieval
	 * Random retrieval cursors only support next, reset and close.
	 */
	WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval));
	if (cval.val != 0) {
		__wt_cursor_set_notsup(cursor);
		cursor->next = __curfile_next_random;
		cursor->reset = __curfile_reset;
	}

	/* Underlying btree initialization. */
	__wt_btcur_open(cbt);

	/* __wt_cursor_init is last so we don't have to clean up on error. */
	WT_ERR(__wt_cursor_init(
	    cursor, cursor->internal_uri, owner, cfg, cursorp));

	WT_STAT_FAST_CONN_INCR(session, cursor_create);
	WT_STAT_FAST_DATA_INCR(session, cursor_create);

	if (0) {
err:		__wt_free(session, cbt);
	}

	return (ret);
}
Exemplo n.º 7
0
/*
 * __logmgr_config --
 *	Parse and setup the logging server options.
 */
static int
__logmgr_config(
    WT_SESSION_IMPL *session, const char **cfg, bool *runp, bool reconfig)
{
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;
	bool enabled;

	conn = S2C(session);

	WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval));
	enabled = cval.val != 0;

	/*
	 * If we're reconfiguring, enabled must match the already
	 * existing setting.
	 *
	 * If it is off and the user it turning it on, or it is on
	 * and the user is turning it off, return an error.
	 */
	if (reconfig &&
	    ((enabled && !FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) ||
	    (!enabled && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))))
		return (EINVAL);

	/* Logging is incompatible with in-memory */
	if (enabled) {
		WT_RET(__wt_config_gets(session, cfg, "in_memory", &cval));
		if (cval.val != 0)
			WT_RET_MSG(session, EINVAL,
			    "In memory configuration incompatible with "
			    "log=(enabled=true)");
	}

	*runp = enabled;

	/*
	 * Setup a log path and compression even if logging is disabled in case
	 * we are going to print a log.  Only do this on creation.  Once a
	 * compressor or log path are set they cannot be changed.
	 */
	if (!reconfig) {
		conn->log_compressor = NULL;
		WT_RET(__wt_config_gets_none(
		    session, cfg, "log.compressor", &cval));
		WT_RET(__wt_compressor_config(
		    session, &cval, &conn->log_compressor));

		WT_RET(__wt_config_gets(session, cfg, "log.path", &cval));
		WT_RET(__wt_strndup(
		    session, cval.str, cval.len, &conn->log_path));
	}
	/* We are done if logging isn't enabled. */
	if (!*runp)
		return (0);

	WT_RET(__wt_config_gets(session, cfg, "log.archive", &cval));
	if (cval.val != 0)
		FLD_SET(conn->log_flags, WT_CONN_LOG_ARCHIVE);

	if (!reconfig) {
		/*
		 * Ignore if the user tries to change the file size.  The
		 * amount of memory allocated to the log slots may be based
		 * on the log file size at creation and we don't want to
		 * re-allocate that memory while running.
		 */
		WT_RET(__wt_config_gets(session, cfg, "log.file_max", &cval));
		conn->log_file_max = (wt_off_t)cval.val;
		WT_STAT_FAST_CONN_SET(session,
		    log_max_filesize, conn->log_file_max);
	}

	/*
	 * If pre-allocation is configured, set the initial number to a few.
	 * We'll adapt as load dictates.
	 */
	WT_RET(__wt_config_gets(session, cfg, "log.prealloc", &cval));
	if (cval.val != 0)
		conn->log_prealloc = 1;

	/*
	 * Note that it is meaningless to reconfigure this value during
	 * runtime.  It only matters on create before recovery runs.
	 */
	WT_RET(__wt_config_gets_def(session, cfg, "log.recover", 0, &cval));
	if (cval.len != 0  && WT_STRING_MATCH("error", cval.str, cval.len))
		FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR);

	WT_RET(__wt_config_gets(session, cfg, "log.zero_fill", &cval));
	if (cval.val != 0) {
		if (F_ISSET(conn, WT_CONN_READONLY))
			WT_RET_MSG(session, EINVAL,
			    "Read-only configuration incompatible with "
			    "zero-filling log files");
		FLD_SET(conn->log_flags, WT_CONN_LOG_ZERO_FILL);
	}

	WT_RET(__logmgr_sync_cfg(session, cfg));
	if (conn->log_cond != NULL)
		WT_RET(__wt_cond_auto_signal(session, conn->log_cond));
	return (0);
}
Exemplo n.º 8
0
Arquivo: txn.c Projeto: To4e/mongo
/*
 * __wt_txn_commit --
 *	Commit the current transaction.
 */
int
__wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_TXN *txn;
	WT_TXN_OP *op;
	u_int i;

	txn = &session->txn;
	conn = S2C(session);
	WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || txn->mod_count == 0);

	if (!F_ISSET(txn, WT_TXN_RUNNING))
		WT_RET_MSG(session, EINVAL, "No transaction is active");

	/*
	 * The default sync setting is inherited from the connection, but can
	 * be overridden by an explicit "sync" setting for this transaction.
	 */
	WT_RET(__wt_config_gets_def(session, cfg, "sync", 0, &cval));

	/*
	 * If the user chose the default setting, check whether sync is enabled
	 * for this transaction (either inherited or via begin_transaction).
	 * If sync is disabled, clear the field to avoid the log write being
	 * flushed.
	 *
	 * Otherwise check for specific settings.  We don't need to check for
	 * "on" because that is the default inherited from the connection.  If
	 * the user set anything in begin_transaction, we only override with an
	 * explicit setting.
	 */
	if (cval.len == 0) {
		if (!FLD_ISSET(txn->txn_logsync, WT_LOG_FLUSH) &&
		    !F_ISSET(txn, WT_TXN_SYNC_SET))
			txn->txn_logsync = 0;
	} else {
		/*
		 * If the caller already set sync on begin_transaction then
		 * they should not be using sync on commit_transaction.
		 * Flag that as an error.
		 */
		if (F_ISSET(txn, WT_TXN_SYNC_SET))
			WT_RET_MSG(session, EINVAL,
			    "Sync already set during begin_transaction.");
		if (WT_STRING_MATCH("background", cval.str, cval.len))
			txn->txn_logsync = WT_LOG_BACKGROUND;
		else if (WT_STRING_MATCH("off", cval.str, cval.len))
			txn->txn_logsync = 0;
		/*
		 * We don't need to check for "on" here because that is the
		 * default to inherit from the connection setting.
		 */
	}

	/* Commit notification. */
	if (txn->notify != NULL)
		WT_TRET(txn->notify->notify(txn->notify,
		    (WT_SESSION *)session, txn->id, 1));

	/* If we are logging, write a commit log record. */
	if (ret == 0 && txn->mod_count > 0 &&
	    FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
	    !F_ISSET(session, WT_SESSION_NO_LOGGING)) {
		/*
		 * We are about to block on I/O writing the log.
		 * Release our snapshot in case it is keeping data pinned.
		 * This is particularly important for checkpoints.
		 */
		__wt_txn_release_snapshot(session);
		ret = __wt_txn_log_commit(session, cfg);
		WT_ASSERT(session, ret == 0);
	}

	/*
	 * If anything went wrong, roll back.
	 *
	 * !!!
	 * Nothing can fail after this point.
	 */
	if (ret != 0) {
		WT_TRET(__wt_txn_rollback(session, cfg));
		return (ret);
	}

	/* Free memory associated with updates. */
	for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++)
		__wt_txn_op_free(session, op);
	txn->mod_count = 0;

	/*
	 * We are about to release the snapshot: copy values into any
	 * positioned cursors so they don't point to updates that could be
	 * freed once we don't have a transaction ID pinned.
	 */
	if (session->ncursors > 0)
		WT_RET(__wt_session_copy_values(session));

	__wt_txn_release(session);
	return (0);
}
Exemplo n.º 9
0
/*
 * __wt_curfile_open --
 *	WT_SESSION->open_cursor method for the btree cursor type.
 */
int
__wt_curfile_open(WT_SESSION_IMPL *session, const char *uri,
    WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
{
	WT_CONFIG_ITEM cval;
	WT_DECL_RET;
	uint32_t flags;
	bool bitmap, bulk, checkpoint_wait;

	bitmap = bulk = false;
	checkpoint_wait = true;
	flags = 0;

	/*
	 * Decode the bulk configuration settings. In memory databases
	 * ignore bulk load.
	 */
	if (!F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) {
		WT_RET(__wt_config_gets_def(session, cfg, "bulk", 0, &cval));
		if (cval.type == WT_CONFIG_ITEM_BOOL ||
		    (cval.type == WT_CONFIG_ITEM_NUM &&
		    (cval.val == 0 || cval.val == 1))) {
			bitmap = false;
			bulk = cval.val != 0;
		} else if (WT_STRING_MATCH("bitmap", cval.str, cval.len))
			bitmap = bulk = true;
			/*
			 * Unordered bulk insert is a special case used
			 * internally by index creation on existing tables. It
			 * doesn't enforce any special semantics at the file
			 * level. It primarily exists to avoid some locking
			 * problems between LSM and index creation.
			 */
		else if (!WT_STRING_MATCH("unordered", cval.str, cval.len))
			WT_RET_MSG(session, EINVAL,
			    "Value for 'bulk' must be a boolean or 'bitmap'");

		if (bulk) {
			WT_RET(__wt_config_gets(session,
			    cfg, "checkpoint_wait", &cval));
			checkpoint_wait = cval.val != 0;
		}
	}

	/* Bulk handles require exclusive access. */
	if (bulk)
		LF_SET(WT_BTREE_BULK | WT_DHANDLE_EXCLUSIVE);

	/* Get the handle and lock it while the cursor is using it. */
	if (WT_PREFIX_MATCH(uri, "file:")) {
		/*
		 * If we are opening exclusive and don't want a bulk cursor
		 * open to fail with EBUSY due to a database-wide checkpoint,
		 * get the handle while holding the checkpoint lock.
		 */
		if (LF_ISSET(WT_DHANDLE_EXCLUSIVE) && checkpoint_wait)
			WT_WITH_CHECKPOINT_LOCK(session,
			    ret = __wt_session_get_btree_ckpt(
			    session, uri, cfg, flags));
		else
			ret = __wt_session_get_btree_ckpt(
			    session, uri, cfg, flags);
		WT_RET(ret);
	} else
		WT_RET(__wt_bad_object_type(session, uri));

	WT_ERR(__curfile_create(session, owner, cfg, bulk, bitmap, cursorp));

	return (0);

err:	/* If the cursor could not be opened, release the handle. */
	WT_TRET(__wt_session_release_btree(session));
	return (ret);
}
Exemplo n.º 10
0
/*
 * __curfile_create --
 *	Open a cursor for a given btree handle.
 */
static int
__curfile_create(WT_SESSION_IMPL *session,
    WT_CURSOR *owner, const char *cfg[], bool bulk, bool bitmap,
    WT_CURSOR **cursorp)
{
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,		/* get-key */
	    __wt_cursor_get_value,		/* get-value */
	    __wt_cursor_set_key,		/* set-key */
	    __wt_cursor_set_value,		/* set-value */
	    __curfile_compare,			/* compare */
	    __curfile_equals,			/* equals */
	    __curfile_next,			/* next */
	    __curfile_prev,			/* prev */
	    __curfile_reset,			/* reset */
	    __curfile_search,			/* search */
	    __curfile_search_near,		/* search-near */
	    __curfile_insert,			/* insert */
	    __wt_cursor_modify_notsup,		/* modify */
	    __curfile_update,			/* update */
	    __curfile_remove,			/* remove */
	    __curfile_reserve,			/* reserve */
	    __wt_cursor_reconfigure,		/* reconfigure */
	    __curfile_close);			/* close */
	WT_BTREE *btree;
	WT_CONFIG_ITEM cval;
	WT_CURSOR *cursor;
	WT_CURSOR_BTREE *cbt;
	WT_CURSOR_BULK *cbulk;
	WT_DECL_RET;
	size_t csize;

	WT_STATIC_ASSERT(offsetof(WT_CURSOR_BTREE, iface) == 0);

	cbt = NULL;

	btree = S2BT(session);
	WT_ASSERT(session, btree != NULL);

	csize = bulk ? sizeof(WT_CURSOR_BULK) : sizeof(WT_CURSOR_BTREE);
	WT_RET(__wt_calloc(session, 1, csize, &cbt));

	cursor = &cbt->iface;
	*cursor = iface;
	cursor->session = &session->iface;
	cursor->internal_uri = btree->dhandle->name;
	cursor->key_format = btree->key_format;
	cursor->value_format = btree->value_format;
	cbt->btree = btree;

	/*
	 * Increment the data-source's in-use counter; done now because closing
	 * the cursor will decrement it, and all failure paths from here close
	 * the cursor.
	 */
	__wt_cursor_dhandle_incr_use(session);

	if (session->dhandle->checkpoint != NULL)
		F_SET(cbt, WT_CBT_NO_TXN);

	if (bulk) {
		F_SET(cursor, WT_CURSTD_BULK);

		cbulk = (WT_CURSOR_BULK *)cbt;

		/* Optionally skip the validation of each bulk-loaded key. */
		WT_ERR(__wt_config_gets_def(
		    session, cfg, "skip_sort_check", 0, &cval));
		WT_ERR(__wt_curbulk_init(
		    session, cbulk, bitmap, cval.val == 0 ? 0 : 1));
	}

	/*
	 * Random retrieval, row-store only.
	 * Random retrieval cursors support a limited set of methods.
	 */
	WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval));
	if (cval.val != 0) {
		if (WT_CURSOR_RECNO(cursor))
			WT_ERR_MSG(session, ENOTSUP,
			    "next_random configuration not supported for "
			    "column-store objects");

		__wt_cursor_set_notsup(cursor);
		cursor->next = __wt_curfile_next_random;
		cursor->reset = __curfile_reset;

		WT_ERR(__wt_config_gets_def(
		    session, cfg, "next_random_sample_size", 0, &cval));
		if (cval.val != 0)
			cbt->next_random_sample_size = (u_int)cval.val;
	}

	/* Underlying btree initialization. */
	__wt_btcur_open(cbt);

	/*
	 * WT_CURSOR.modify supported on 'u' value formats, but the fast-path
	 * through the btree code requires log file format changes, it's not
	 * available in all versions.
	 */
	if (WT_STREQ(cursor->value_format, "u") &&
	    S2C(session)->compat_major >= WT_LOG_V2)
		cursor->modify = __curfile_modify;

	WT_ERR(__wt_cursor_init(
	    cursor, cursor->internal_uri, owner, cfg, cursorp));

	WT_STAT_CONN_INCR(session, cursor_create);
	WT_STAT_DATA_INCR(session, cursor_create);

	if (0) {
err:		/*
		 * Our caller expects to release the data handle if we fail.
		 * Disconnect it from the cursor before closing.
		 */
		if (session->dhandle != NULL)
			__wt_cursor_dhandle_decr_use(session);
		cbt->btree = NULL;
		WT_TRET(__curfile_close(cursor));
		*cursorp = NULL;
	}

	return (ret);
}
Exemplo n.º 11
0
/*
 * __logmgr_config --
 *	Parse and setup the logging server options.
 */
static int
__logmgr_config(
    WT_SESSION_IMPL *session, const char **cfg, bool *runp, bool reconfig)
{
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;
	bool enabled;

	/*
	 * A note on reconfiguration: the standard "is this configuration string
	 * allowed" checks should fail if reconfiguration has invalid strings,
	 * for example, "log=(enabled)", or "statistics_log=(path=XXX)", because
	 * the connection reconfiguration method doesn't allow those strings.
	 * Additionally, the base configuration values during reconfiguration
	 * are the currently configured values (so we don't revert to default
	 * values when repeatedly reconfiguring), and configuration processing
	 * of a currently set value should not change the currently set value.
	 *
	 * In this code path, log server reconfiguration does not stop/restart
	 * the log server, so there's no point in re-evaluating configuration
	 * strings that cannot be reconfigured, risking bugs in configuration
	 * setup, and depending on evaluation of currently set values to always
	 * result in the currently set value. Skip tests for any configuration
	 * strings which don't make sense during reconfiguration, but don't
	 * worry about error reporting because it should never happen.
	 */

	conn = S2C(session);

	WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval));
	enabled = cval.val != 0;

	/*
	 * If we're reconfiguring, enabled must match the already
	 * existing setting.
	 *
	 * If it is off and the user it turning it on, or it is on
	 * and the user is turning it off, return an error.
	 *
	 * See above: should never happen.
	 */
	if (reconfig &&
	    ((enabled && !FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) ||
	    (!enabled && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))))
		return (EINVAL);

	/* Logging is incompatible with in-memory */
	if (enabled) {
		WT_RET(__wt_config_gets(session, cfg, "in_memory", &cval));
		if (cval.val != 0)
			WT_RET_MSG(session, EINVAL,
			    "In memory configuration incompatible with "
			    "log=(enabled=true)");
	}

	*runp = enabled;

	/*
	 * Setup a log path and compression even if logging is disabled in case
	 * we are going to print a log.  Only do this on creation.  Once a
	 * compressor or log path are set they cannot be changed.
	 *
	 * See above: should never happen.
	 */
	if (!reconfig) {
		conn->log_compressor = NULL;
		WT_RET(__wt_config_gets_none(
		    session, cfg, "log.compressor", &cval));
		WT_RET(__wt_compressor_config(
		    session, &cval, &conn->log_compressor));

		WT_RET(__wt_config_gets(session, cfg, "log.path", &cval));
		WT_RET(__wt_strndup(
		    session, cval.str, cval.len, &conn->log_path));
	}

	/* We are done if logging isn't enabled. */
	if (!*runp)
		return (0);

	WT_RET(__wt_config_gets(session, cfg, "log.archive", &cval));
	if (cval.val != 0)
		FLD_SET(conn->log_flags, WT_CONN_LOG_ARCHIVE);

	/*
	 * The file size cannot be reconfigured. The amount of memory allocated
	 * to the log slots may be based on the log file size at creation and we
	 * don't want to re-allocate that memory while running.
	 *
	 * See above: should never happen.
	 */
	if (!reconfig) {
		WT_RET(__wt_config_gets(session, cfg, "log.file_max", &cval));
		conn->log_file_max = (wt_off_t)cval.val;
		WT_STAT_FAST_CONN_SET(session,
		    log_max_filesize, conn->log_file_max);
	}

	/*
	 * If pre-allocation is configured, set the initial number to a few.
	 * We'll adapt as load dictates.
	 */
	WT_RET(__wt_config_gets(session, cfg, "log.prealloc", &cval));
	if (cval.val != 0)
		conn->log_prealloc = 1;

	/*
	 * Note it's meaningless to reconfigure this value during runtime, it
	 * only matters on create before recovery runs.
	 *
	 * See above: should never happen.
	 */
	if (!reconfig) {
		WT_RET(__wt_config_gets_def(
		    session, cfg, "log.recover", 0, &cval));
		if (WT_STRING_MATCH("error", cval.str, cval.len))
			FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR);
	}

	WT_RET(__wt_config_gets(session, cfg, "log.zero_fill", &cval));
	if (cval.val != 0) {
		if (F_ISSET(conn, WT_CONN_READONLY))
			WT_RET_MSG(session, EINVAL,
			    "Read-only configuration incompatible with "
			    "zero-filling log files");
		FLD_SET(conn->log_flags, WT_CONN_LOG_ZERO_FILL);
	}

	WT_RET(__logmgr_sync_cfg(session, cfg));
	if (conn->log_cond != NULL)
		WT_RET(__wt_cond_auto_signal(session, conn->log_cond));
	return (0);
}
Exemplo n.º 12
0
/*
 * __wt_txn_commit --
 *	Commit the current transaction.
 */
int
__wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_OP *op;
	u_int i;
	bool locked, readonly;
#ifdef HAVE_TIMESTAMPS
	wt_timestamp_t prev_commit_timestamp, ts;
	bool update_timestamp;
#endif

	txn = &session->txn;
	conn = S2C(session);
	txn_global = &conn->txn_global;
	locked = false;

	WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
	WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) ||
	    txn->mod_count == 0);

	readonly = txn->mod_count == 0;
	/*
	 * Look for a commit timestamp.
	 */
	WT_ERR(
	    __wt_config_gets_def(session, cfg, "commit_timestamp", 0, &cval));
	if (cval.len != 0) {
#ifdef HAVE_TIMESTAMPS
		WT_ERR(__wt_txn_parse_timestamp(session, "commit", &ts, &cval));
		WT_ERR(__wt_timestamp_validate(session,
		    "commit", &ts, &cval, true, true, true));
		__wt_timestamp_set(&txn->commit_timestamp, &ts);
		__wt_txn_set_commit_timestamp(session);
#else
		WT_ERR_MSG(session, EINVAL, "commit_timestamp requires a "
		    "version of WiredTiger built with timestamp support");
#endif
	}

#ifdef HAVE_TIMESTAMPS
	/*
	 * Debugging checks on timestamps, if user requested them.
	 */
	if (F_ISSET(txn, WT_TXN_TS_COMMIT_ALWAYS) &&
	    !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
	    txn->mod_count != 0)
		WT_ERR_MSG(session, EINVAL, "commit_timestamp required and "
		    "none set on this transaction");
	if (F_ISSET(txn, WT_TXN_TS_COMMIT_NEVER) &&
	    F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
	    txn->mod_count != 0)
		WT_ERR_MSG(session, EINVAL, "no commit_timestamp required and "
		    "timestamp set on this transaction");
#endif
	/*
	 * The default sync setting is inherited from the connection, but can
	 * be overridden by an explicit "sync" setting for this transaction.
	 */
	WT_ERR(__wt_config_gets_def(session, cfg, "sync", 0, &cval));

	/*
	 * If the user chose the default setting, check whether sync is enabled
	 * for this transaction (either inherited or via begin_transaction).
	 * If sync is disabled, clear the field to avoid the log write being
	 * flushed.
	 *
	 * Otherwise check for specific settings.  We don't need to check for
	 * "on" because that is the default inherited from the connection.  If
	 * the user set anything in begin_transaction, we only override with an
	 * explicit setting.
	 */
	if (cval.len == 0) {
		if (!FLD_ISSET(txn->txn_logsync, WT_LOG_SYNC_ENABLED) &&
		    !F_ISSET(txn, WT_TXN_SYNC_SET))
			txn->txn_logsync = 0;
	} else {
		/*
		 * If the caller already set sync on begin_transaction then
		 * they should not be using sync on commit_transaction.
		 * Flag that as an error.
		 */
		if (F_ISSET(txn, WT_TXN_SYNC_SET))
			WT_ERR_MSG(session, EINVAL,
			    "Sync already set during begin_transaction");
		if (WT_STRING_MATCH("background", cval.str, cval.len))
			txn->txn_logsync = WT_LOG_BACKGROUND;
		else if (WT_STRING_MATCH("off", cval.str, cval.len))
			txn->txn_logsync = 0;
		/*
		 * We don't need to check for "on" here because that is the
		 * default to inherit from the connection setting.
		 */
	}

	/* Commit notification. */
	if (txn->notify != NULL)
		WT_ERR(txn->notify->notify(txn->notify,
		    (WT_SESSION *)session, txn->id, 1));

	/*
	 * We are about to release the snapshot: copy values into any
	 * positioned cursors so they don't point to updates that could be
	 * freed once we don't have a snapshot.
	 */
	if (session->ncursors > 0) {
		WT_DIAGNOSTIC_YIELD;
		WT_ERR(__wt_session_copy_values(session));
	}

	/* If we are logging, write a commit log record. */
	if (txn->logrec != NULL &&
	    FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
	    !F_ISSET(session, WT_SESSION_NO_LOGGING)) {
		/*
		 * We are about to block on I/O writing the log.
		 * Release our snapshot in case it is keeping data pinned.
		 * This is particularly important for checkpoints.
		 */
		__wt_txn_release_snapshot(session);
		/*
		 * We hold the visibility lock for reading from the time
		 * we write our log record until the time we release our
		 * transaction so that the LSN any checkpoint gets will
		 * always reflect visible data.
		 */
		__wt_readlock(session, &txn_global->visibility_rwlock);
		locked = true;
		WT_ERR(__wt_txn_log_commit(session, cfg));
	}

	/* Note: we're going to commit: nothing can fail after this point. */

	/* Process and free updates. */
	for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
		switch (op->type) {
		case WT_TXN_OP_BASIC:
		case WT_TXN_OP_BASIC_TS:
		case WT_TXN_OP_INMEM:
			/*
			 * Switch reserved operations to abort to
			 * simplify obsolete update list truncation.
			 */
			if (op->u.upd->type == WT_UPDATE_RESERVED) {
				op->u.upd->txnid = WT_TXN_ABORTED;
				break;
			}

			/*
			 * Writes to the lookaside file can be evicted as soon
			 * as they commit.
			 */
			if (conn->cache->las_fileid != 0 &&
			    op->fileid == conn->cache->las_fileid) {
				op->u.upd->txnid = WT_TXN_NONE;
				break;
			}

#ifdef HAVE_TIMESTAMPS
			if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
			    op->type != WT_TXN_OP_BASIC_TS) {
				WT_ASSERT(session,
				    op->fileid != WT_METAFILE_ID);
				__wt_timestamp_set(&op->u.upd->timestamp,
				    &txn->commit_timestamp);
			}
#endif
			break;

		case WT_TXN_OP_REF:
#ifdef HAVE_TIMESTAMPS
			if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
				__wt_timestamp_set(
				    &op->u.ref->page_del->timestamp,
				    &txn->commit_timestamp);
#endif
			break;

		case WT_TXN_OP_TRUNCATE_COL:
		case WT_TXN_OP_TRUNCATE_ROW:
			/* Other operations don't need timestamps. */
			break;
		}

		__wt_txn_op_free(session, op);
	}
	txn->mod_count = 0;

#ifdef HAVE_TIMESTAMPS
	/*
	 * Track the largest commit timestamp we have seen.
	 *
	 * We don't actually clear the local commit timestamp, just the flag.
	 * That said, we can't update the global commit timestamp until this
	 * transaction is visible, which happens when we release it.
	 */
	update_timestamp = F_ISSET(txn, WT_TXN_HAS_TS_COMMIT);
#endif

	__wt_txn_release(session);
	if (locked)
		__wt_readunlock(session, &txn_global->visibility_rwlock);

#ifdef HAVE_TIMESTAMPS
	/* First check if we've already committed something in the future. */
	if (update_timestamp) {
		WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
		    __wt_timestamp_set(
			&prev_commit_timestamp, &txn_global->commit_timestamp));
		update_timestamp = __wt_timestamp_cmp(
		    &txn->commit_timestamp, &prev_commit_timestamp) > 0;
	}

	/*
	 * If it looks like we need to move the global commit timestamp,
	 * write lock and re-check.
	 */
	if (update_timestamp) {
#if WT_TIMESTAMP_SIZE == 8
		while (__wt_timestamp_cmp(
		    &txn->commit_timestamp, &prev_commit_timestamp) > 0) {
			if (__wt_atomic_cas64(
			    &txn_global->commit_timestamp.val,
			    prev_commit_timestamp.val,
			    txn->commit_timestamp.val)) {
				txn_global->has_commit_timestamp = true;
				break;
			}
		    __wt_timestamp_set(
			&prev_commit_timestamp, &txn_global->commit_timestamp);
		}
#else
		__wt_writelock(session, &txn_global->rwlock);
		if (__wt_timestamp_cmp(&txn->commit_timestamp,
		    &txn_global->commit_timestamp) > 0) {
			__wt_timestamp_set(&txn_global->commit_timestamp,
			    &txn->commit_timestamp);
			txn_global->has_commit_timestamp = true;
		}
		__wt_writeunlock(session, &txn_global->rwlock);
#endif
	}
#endif

	/*
	 * We're between transactions, if we need to block for eviction, it's
	 * a good time to do so.  Note that we must ignore any error return
	 * because the user's data is committed.
	 */
	if (!readonly)
		(void)__wt_cache_eviction_check(session, false, false, NULL);
	return (0);

err:	/*
	 * If anything went wrong, roll back.
	 *
	 * !!!
	 * Nothing can fail after this point.
	 */
	if (locked)
		__wt_readunlock(session, &txn_global->visibility_rwlock);
	WT_TRET(__wt_txn_rollback(session, cfg));
	return (ret);
}
Exemplo n.º 13
0
/*
 * __wt_txn_config --
 *	Configure a transaction.
 */
int
__wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CONFIG_ITEM cval;
	WT_TXN *txn;

	txn = &session->txn;

	WT_RET(__wt_config_gets_def(session, cfg, "isolation", 0, &cval));
	if (cval.len != 0)
		txn->isolation =
		    WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
		    WT_ISO_SNAPSHOT :
		    WT_STRING_MATCH("read-committed", cval.str, cval.len) ?
		    WT_ISO_READ_COMMITTED : WT_ISO_READ_UNCOMMITTED;

	/*
	 * The default sync setting is inherited from the connection, but can
	 * be overridden by an explicit "sync" setting for this transaction.
	 *
	 * We want to distinguish between inheriting implicitly and explicitly.
	 */
	F_CLR(txn, WT_TXN_SYNC_SET);
	WT_RET(__wt_config_gets_def(
	    session, cfg, "sync", (int)UINT_MAX, &cval));
	if (cval.val == 0 || cval.val == 1)
		/*
		 * This is an explicit setting of sync.  Set the flag so
		 * that we know not to overwrite it in commit_transaction.
		 */
		F_SET(txn, WT_TXN_SYNC_SET);

	/*
	 * If sync is turned off explicitly, clear the transaction's sync field.
	 */
	if (cval.val == 0)
		txn->txn_logsync = 0;

	WT_RET(__wt_config_gets_def(session, cfg, "snapshot", 0, &cval));
	if (cval.len > 0)
		/*
		 * The layering here isn't ideal - the named snapshot get
		 * function does both validation and setup. Otherwise we'd
		 * need to walk the list of named snapshots twice during
		 * transaction open.
		 */
		WT_RET(__wt_txn_named_snapshot_get(session, &cval));

	WT_RET(__wt_config_gets_def(session, cfg, "read_timestamp", 0, &cval));
	if (cval.len > 0) {
#ifdef HAVE_TIMESTAMPS
		wt_timestamp_t ts;
		WT_TXN_GLOBAL *txn_global;
		char timestamp_buf[2 * WT_TIMESTAMP_SIZE + 1];
		bool round_to_oldest;

		txn_global = &S2C(session)->txn_global;
		WT_RET(__wt_txn_parse_timestamp(session, "read", &ts, &cval));

		/*
		 * Read the configuration here to reduce the span of the
		 * critical section.
		 */
		WT_RET(__wt_config_gets_def(session,
		    cfg, "round_to_oldest", 0, &cval));
		round_to_oldest = cval.val;
		/*
		 * This code is not using the timestamp validate function to
		 * avoid a race between checking and setting transaction
		 * timestamp.
		 */
		__wt_readlock(session, &txn_global->rwlock);
		if (__wt_timestamp_cmp(&ts, &txn_global->oldest_timestamp) < 0)
		{
			WT_RET(__wt_timestamp_to_hex_string(session,
			    timestamp_buf, &ts));
			/*
			 * If given read timestamp is earlier than oldest
			 * timestamp then round the read timestamp to
			 * oldest timestamp.
			 */
			if (round_to_oldest)
				__wt_timestamp_set(&txn->read_timestamp,
				    &txn_global->oldest_timestamp);
			else {
				__wt_readunlock(session, &txn_global->rwlock);
				WT_RET_MSG(session, EINVAL, "read timestamp "
				    "%s older than oldest timestamp",
				    timestamp_buf);
			}
		} else {
			__wt_timestamp_set(&txn->read_timestamp, &ts);
			/*
			 * Reset to avoid a verbose message as read
			 * timestamp is not rounded to oldest timestamp.
			 */
			round_to_oldest = false;
		}

		__wt_txn_set_read_timestamp(session);
		__wt_readunlock(session, &txn_global->rwlock);
		txn->isolation = WT_ISO_SNAPSHOT;
		if (round_to_oldest) {
			/*
			 * This message is generated here to reduce the span of
			 * critical section.
			 */
			__wt_verbose(session, WT_VERB_TIMESTAMP, "Read "
			    "timestamp %s : Rounded to oldest timestamp",
			    timestamp_buf);
		}
#else
		WT_RET_MSG(session, EINVAL, "read_timestamp requires a "
		    "version of WiredTiger built with timestamp support");
#endif
	}

	return (0);
}
Exemplo n.º 14
0
/*
 * __wt_curmetadata_open --
 *	WT_SESSION->open_cursor method for metadata cursors.
 *
 * Metadata cursors are a similar to a file cursor on the special metadata
 * table, except that the metadata for the metadata table (which is stored
 * in the turtle file) can also be queried.
 *
 * Metadata cursors are read-only by default.
 */
int
__wt_curmetadata_open(WT_SESSION_IMPL *session,
    const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
{
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,		/* get-key */
	    __wt_cursor_get_value,		/* get-value */
	    __wt_cursor_set_key,		/* set-key */
	    __wt_cursor_set_value,		/* set-value */
	    __curmetadata_compare,		/* compare */
	    __wt_cursor_equals,			/* equals */
	    __curmetadata_next,			/* next */
	    __curmetadata_prev,			/* prev */
	    __curmetadata_reset,		/* reset */
	    __curmetadata_search,		/* search */
	    __curmetadata_search_near,		/* search-near */
	    __curmetadata_insert,		/* insert */
	    __curmetadata_update,		/* update */
	    __curmetadata_remove,		/* remove */
	    __wt_cursor_reconfigure_notsup,	/* reconfigure */
	    __curmetadata_close);		/* close */
	WT_CURSOR *cursor;
	WT_CURSOR_METADATA *mdc;
	WT_DECL_RET;
	WT_CONFIG_ITEM cval;

	WT_RET(__wt_calloc_one(session, &mdc));

	cursor = &mdc->iface;
	*cursor = iface;
	cursor->session = &session->iface;
	cursor->key_format = "S";
	cursor->value_format = "S";

	/*
	 * Open the file cursor for operations on the regular metadata; don't
	 * use the existing, cached session metadata cursor, the configuration
	 * may not be the same.
	 */
	WT_ERR(__wt_metadata_cursor_open(session, cfg[1], &mdc->file_cursor));

	WT_ERR(__wt_cursor_init(cursor, uri, owner, cfg, cursorp));

	/* If we are only returning create config, strip internal metadata. */
	if (WT_STREQ(uri, "metadata:create"))
		F_SET(mdc, WT_MDC_CREATEONLY);

	/*
	 * Metadata cursors default to readonly; if not set to not-readonly,
	 * they are permanently readonly and cannot be reconfigured.
	 */
	WT_ERR(__wt_config_gets_def(session, cfg, "readonly", 1, &cval));
	if (cval.val != 0) {
		cursor->insert = __wt_cursor_notsup;
		cursor->update = __wt_cursor_notsup;
		cursor->remove = __wt_cursor_notsup;
	}

	if (0) {
err:		WT_TRET(__curmetadata_close(cursor));
		*cursorp = NULL;
	}
	return (ret);
}