예제 #1
0
/*
 * __recovery_file_scan --
 *	Scan the files referenced from the metadata and gather information
 *	about them for recovery.
 */
static int
__recovery_file_scan(WT_RECOVERY *r)
{
	WT_DECL_RET;
	WT_CURSOR *c;
	const char *uri, *config;
	int cmp;

	/* Scan through all files in the metadata. */
	c = r->files[0].c;
	c->set_key(c, "file:");
	if ((ret = c->search_near(c, &cmp)) != 0) {
		/* Is the metadata empty? */
		if (ret == WT_NOTFOUND)
			ret = 0;
		goto err;
	}
	if (cmp < 0)
		WT_ERR_NOTFOUND_OK(c->next(c));
	for (; ret == 0; ret = c->next(c)) {
		WT_ERR(c->get_key(c, &uri));
		if (!WT_PREFIX_MATCH(uri, "file:"))
			break;
		WT_ERR(c->get_value(c, &config));
		WT_ERR(__recovery_setup_file(r, uri, config));
	}
	WT_ERR_NOTFOUND_OK(ret);

err:	return (ret);
}
예제 #2
0
파일: cur_backup.c 프로젝트: mdkhaled/mongo
/*
 * __backup_all --
 *	Backup all objects in the database.
 */
static int
__backup_all(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb)
{
	WT_CONFIG_ITEM cval;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	const char *key, *value;

	cursor = NULL;

	/*
	 * Open a cursor on the metadata file and copy all of the entries to
	 * the hot backup file.
	 */
	WT_ERR(__wt_metadata_cursor(session, NULL, &cursor));
	while ((ret = cursor->next(cursor)) == 0) {
		WT_ERR(cursor->get_key(cursor, &key));
		WT_ERR(cursor->get_value(cursor, &value));
		WT_ERR_TEST((fprintf(
		    cb->bfp, "%s\n%s\n", key, value) < 0), __wt_errno());

		/*
		 * While reading the metadata file, check there are no "sources"
		 * or "types" which can't support hot backup.  This checks for
		 * a data source that's non-standard, which can't be backed up,
		 * but is also sanity checking: if there's an entry backed by
		 * anything other than a file or lsm entry, we're confused.
		 */
		if ((ret = __wt_config_getones(
		    session, value, "type", &cval)) == 0 &&
		    !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "file") &&
		    !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "lsm"))
			WT_ERR_MSG(session, ENOTSUP,
			    "hot backup is not supported for objects of "
			    "type %.*s", (int)cval.len, cval.str);
		WT_ERR_NOTFOUND_OK(ret);
		if ((ret =__wt_config_getones(
		    session, value, "source", &cval)) == 0 &&
		    !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "file:") &&
		    !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "lsm:"))
			WT_ERR_MSG(session, ENOTSUP,
			    "hot backup is not supported for objects of "
			    "source %.*s", (int)cval.len, cval.str);
		WT_ERR_NOTFOUND_OK(ret);
	}
	WT_ERR_NOTFOUND_OK(ret);

	/* Build a list of the file objects that need to be copied. */
	WT_WITH_DHANDLE_LOCK(session,
	    ret = __wt_meta_btree_apply(
	    session, __backup_list_all_append, NULL));

err:	if (cursor != NULL)
		WT_TRET(cursor->close(cursor));
	return (ret);
}
예제 #3
0
/*
 * __recovery_set_checkpoint_timestamp --
 *	Set the checkpoint timestamp as retrieved from the metadata file.
 */
static int
__recovery_set_checkpoint_timestamp(WT_RECOVERY *r)
{
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	wt_timestamp_t ckpt_timestamp;
	char ts_string[WT_TS_INT_STRING_SIZE], *sys_config;

	sys_config = NULL;

	session = r->session;
	conn = S2C(session);
	/*
	 * Read the system checkpoint information from the metadata file and
	 * save the stable timestamp of the last checkpoint for later query.
	 * This gets saved in the connection.
	 */
	ckpt_timestamp = 0;

	/* Search in the metadata for the system information. */
	WT_ERR_NOTFOUND_OK(
	    __wt_metadata_search(session, WT_SYSTEM_CKPT_URI, &sys_config));
	if (sys_config != NULL) {
		WT_CLEAR(cval);
		WT_ERR_NOTFOUND_OK(__wt_config_getones(
		    session, sys_config, "checkpoint_timestamp", &cval));
		if (cval.len != 0) {
			__wt_verbose(session, WT_VERB_RECOVERY,
			    "Recovery timestamp %.*s",
			    (int)cval.len, cval.str);
			WT_ERR(__wt_txn_parse_timestamp_raw(session,
			    "recovery", &ckpt_timestamp, &cval));
		}
	}

	/*
	 * Set the recovery checkpoint timestamp and the metadata checkpoint
	 * timestamp so that the checkpoint after recovery writes the correct
	 * value into the metadata.
	 */
	conn->txn_global.meta_ckpt_timestamp =
	    conn->txn_global.recovery_timestamp = ckpt_timestamp;

	if (WT_VERBOSE_ISSET(session,
	    WT_VERB_RECOVERY | WT_VERB_RECOVERY_PROGRESS)) {
		__wt_timestamp_to_string(
		    conn->txn_global.recovery_timestamp, ts_string);
		__wt_verbose(session,
		    WT_VERB_RECOVERY | WT_VERB_RECOVERY_PROGRESS,
		    "Set global recovery timestamp: %s", ts_string);
	}
err:	__wt_free(session, sys_config);
	return (ret);
}
예제 #4
0
/*
 * __backup_uri --
 *	Backup a list of objects.
 */
static int
__backup_uri(WT_SESSION_IMPL *session,
             WT_CURSOR_BACKUP *cb, const char *cfg[], int *foundp, int *log_only)
{
    WT_CONFIG targetconf;
    WT_CONFIG_ITEM cval, k, v;
    WT_DECL_ITEM(tmp);
    WT_DECL_RET;
    int target_list;
    const char *uri;

    *foundp = 0;
    *log_only = 0;

    /*
     * If we find a non-empty target configuration string, we have a job,
     * otherwise it's not our problem.
     */
    WT_RET(__wt_config_gets(session, cfg, "target", &cval));
    WT_RET(__wt_config_subinit(session, &targetconf, &cval));
    for (cb->list_next = 0, target_list = 0;
            (ret = __wt_config_next(&targetconf, &k, &v)) == 0; ++target_list) {
        /* If it is our first time through, allocate. */
        if (target_list == 0) {
            *foundp = 1;
            WT_ERR(__wt_scr_alloc(session, 512, &tmp));
        }

        WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str));
        uri = tmp->data;
        if (v.len != 0)
            WT_ERR_MSG(session, EINVAL,
                       "%s: invalid backup target: URIs may need quoting",
                       uri);

        /*
         * Handle log targets.  We do not need to go through the
         * schema worker, just call the function to append them.
         * Set log_only only if it is our only URI target.
         */
        if (WT_PREFIX_MATCH(uri, "log:")) {
            if (target_list == 0)
                *log_only = 1;
            else
                *log_only = 0;
            WT_ERR(__wt_backup_list_uri_append(
                       session, uri, NULL));
        } else
            WT_ERR(__wt_schema_worker(session,
                                      uri, NULL, __wt_backup_list_uri_append, cfg, 0));
    }
    WT_ERR_NOTFOUND_OK(ret);

err:
    __wt_scr_free(session, &tmp);
    return (ret);
}
예제 #5
0
파일: bt_read.c 프로젝트: qihsh/mongo
/*
 * __wt_las_remove_block --
 *	Remove all records matching a key prefix from the lookaside store.
 */
int
__wt_las_remove_block(WT_SESSION_IMPL *session,
    WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size)
{
	WT_DECL_ITEM(las_addr);
	WT_DECL_ITEM(las_key);
	WT_DECL_RET;
	uint64_t las_counter, las_txnid;
	uint32_t las_id;
	int exact;

	WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
	WT_ERR(__wt_scr_alloc(session, 0, &las_key));

	/*
	 * Search for the block's unique prefix and step through all matching
	 * records, removing them.
	 */
	las_addr->data = addr;
	las_addr->size = addr_size;
	las_key->size = 0;
	cursor->set_key(
	    cursor, btree_id, las_addr, (uint64_t)0, (uint32_t)0, las_key);
	if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0)
		ret = cursor->next(cursor);
	for (; ret == 0; ret = cursor->next(cursor)) {
		WT_ERR(cursor->get_key(cursor,
		    &las_id, las_addr, &las_counter, &las_txnid, las_key));

		/*
		 * Confirm the search using the unique prefix; if not a match,
		 * we're done searching for records for this page.
		 */
		 if (las_id != btree_id ||
		     las_addr->size != addr_size ||
		     memcmp(las_addr->data, addr, addr_size) != 0)
			break;

		/*
		 * Cursor opened overwrite=true: won't return WT_NOTFOUND should
		 * another thread remove the record before we do, and the cursor
		 * remains positioned in that case.
		 */
		WT_ERR(cursor->remove(cursor));
	}
	WT_ERR_NOTFOUND_OK(ret);

err:	__wt_scr_free(session, &las_addr);
	__wt_scr_free(session, &las_key);
	return (ret);
}
예제 #6
0
/*
 * __clsm_next --
 *	WT_CURSOR->next method for the LSM cursor type.
 */
static int
__clsm_next(WT_CURSOR *cursor)
{
	WT_CURSOR_LSM *clsm;
	WT_CURSOR *c;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	u_int i;
	int cmp;
	bool check, deleted;

	clsm = (WT_CURSOR_LSM *)cursor;

	CURSOR_API_CALL(cursor, session, next, NULL);
	WT_CURSOR_NOVALUE(cursor);
	WT_ERR(__clsm_enter(clsm, false, false));

	/* If we aren't positioned for a forward scan, get started. */
	if (clsm->current == NULL || !F_ISSET(clsm, WT_CLSM_ITERATE_NEXT)) {
		F_CLR(clsm, WT_CLSM_MULTIPLE);
		WT_FORALL_CURSORS(clsm, c, i) {
			if (!F_ISSET(cursor, WT_CURSTD_KEY_SET)) {
				WT_ERR(c->reset(c));
				ret = c->next(c);
			} else if (c != clsm->current) {
				c->set_key(c, &cursor->key);
				if ((ret = c->search_near(c, &cmp)) == 0) {
					if (cmp < 0)
						ret = c->next(c);
					else if (cmp == 0) {
						if (clsm->current == NULL)
							clsm->current = c;
						else
							F_SET(clsm,
							    WT_CLSM_MULTIPLE);
					}
				} else
					F_CLR(c, WT_CURSTD_KEY_SET);
			}
			WT_ERR_NOTFOUND_OK(ret);
		}
		F_SET(clsm, WT_CLSM_ITERATE_NEXT);
		F_CLR(clsm, WT_CLSM_ITERATE_PREV);

		/* We just positioned *at* the key, now move. */
		if (clsm->current != NULL)
			goto retry;
	} else {
예제 #7
0
/*
 * __truncate_dsrc --
 *	WT_SESSION::truncate for a data-source without a truncate operation.
 */
static int
__truncate_dsrc(WT_SESSION_IMPL *session, const char *uri)
{
	WT_CURSOR *cursor;
	WT_DECL_RET;
	const char *cfg[2];

	/* Open a cursor and traverse the object, removing every entry. */
	cfg[0] = WT_CONFIG_BASE(session, session_open_cursor);
	cfg[1] = NULL;
	WT_RET(__wt_open_cursor(session, uri, NULL, cfg, &cursor));
	while ((ret = cursor->next(cursor)) == 0)
		WT_ERR(cursor->remove(cursor));
	WT_ERR_NOTFOUND_OK(ret);

err:	WT_TRET(cursor->close(cursor));
	return (ret);
}
예제 #8
0
/*
 * __backup_uri --
 *	Backup a list of objects.
 */
static int
__backup_uri(WT_SESSION_IMPL *session,
    WT_CURSOR_BACKUP *cb, const char *cfg[], int *foundp)
{
	WT_CONFIG targetconf;
	WT_CONFIG_ITEM cval, k, v;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	int target_list;
	const char *uri;

	*foundp = target_list = 0;

	/*
	 * If we find a non-empty target configuration string, we have a job,
	 * otherwise it's not our problem.
	 */
	WT_RET(__wt_config_gets(session, cfg, "target", &cval));
	WT_RET(__wt_config_subinit(session, &targetconf, &cval));
	for (cb->list_next = 0;
	    (ret = __wt_config_next(&targetconf, &k, &v)) == 0;) {
		if (!target_list) {
			target_list = *foundp = 1;

			WT_ERR(__wt_scr_alloc(session, 512, &tmp));
		}

		WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str));
		uri = tmp->data;
		if (v.len != 0)
			WT_ERR_MSG(session, EINVAL,
			    "%s: invalid backup target: URIs may need quoting",
			    uri);

		WT_ERR(__wt_schema_worker(
		    session, uri, NULL, __wt_backup_list_uri_append, cfg, 0));
	}
	WT_ERR_NOTFOUND_OK(ret);

err:	__wt_scr_free(&tmp);
	return (ret);
}
예제 #9
0
/*
 * __conn_reconfigure --
 *	WT_CONNECTION->reconfigure method.
 */
static int
__conn_reconfigure(WT_CONNECTION *wt_conn, const char *config)
{
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	/*
	 * Special version of cfg that doesn't include the default config: used
	 * to limit changes to values that the application sets explicitly.
	 * Note that any function using this value has to be prepared to handle
	 * not-found as a valid option return.
	 */
	const char *raw_cfg[] = { config, NULL };

	conn = (WT_CONNECTION_IMPL *)wt_conn;

	CONNECTION_API_CALL(conn, session, reconfigure, config, cfg);

	/* Turning on statistics clears any existing values. */
	if ((ret =
	    __wt_config_gets(session, raw_cfg, "statistics", &cval)) == 0) {
		conn->statistics = cval.val == 0 ? 0 : 1;
		if (conn->statistics)
			__wt_stat_clear_connection_stats(&conn->stats);
	}
	WT_ERR_NOTFOUND_OK(ret);

	WT_ERR(__wt_conn_cache_pool_config(session, cfg));
	WT_ERR(__wt_cache_config(conn, raw_cfg));

	WT_ERR(__conn_verbose_config(session, raw_cfg));

	/* Wake up the cache pool server so any changes are noticed. */
	if (F_ISSET(conn, WT_CONN_CACHE_POOL))
		WT_ERR(__wt_cond_signal(
		    session, __wt_process.cache_pool->cache_pool_cond));

err:	API_END(session);
	return (ret);
}
예제 #10
0
파일: meta_turtle.c 프로젝트: GYGit/mongo
/*
 * __metadata_load_bulk --
 *	Create any bulk-loaded file stubs.
 */
static int
__metadata_load_bulk(WT_SESSION_IMPL *session)
{
	WT_CURSOR *cursor;
	WT_DECL_RET;
	uint32_t allocsize;
	bool exist;
	const char *filecfg[] = {
	    WT_CONFIG_BASE(session, file_meta), NULL, NULL };
	const char *key, *value;

	/*
	 * If a file was being bulk-loaded during the hot backup, it will appear
	 * in the metadata file, but the file won't exist.  Create on demand.
	 */
	WT_RET(__wt_metadata_cursor(session, &cursor));
	while ((ret = cursor->next(cursor)) == 0) {
		WT_ERR(cursor->get_key(cursor, &key));
		if (!WT_PREFIX_SKIP(key, "file:"))
			continue;

		/* If the file exists, it's all good. */
		WT_ERR(__wt_fs_exist(session, key, &exist));
		if (exist)
			continue;

		/*
		 * If the file doesn't exist, assume it's a bulk-loaded file;
		 * retrieve the allocation size and re-create the file.
		 */
		WT_ERR(cursor->get_value(cursor, &value));
		filecfg[1] = value;
		WT_ERR(__wt_direct_io_size_check(
		    session, filecfg, "allocation_size", &allocsize));
		WT_ERR(__wt_block_manager_create(session, key, allocsize));
	}
	WT_ERR_NOTFOUND_OK(ret);

err:	WT_TRET(__wt_metadata_cursor_release(session, &cursor));
	return (ret);
}
예제 #11
0
/*
 * __fill_index --
 *	Fill the index from the current contents of the table.
 */
static int
__fill_index(WT_SESSION_IMPL *session, WT_TABLE *table, WT_INDEX *idx)
{
	WT_DECL_RET;
	WT_CURSOR *tcur, *icur;
	WT_SESSION *wt_session;

	wt_session = &session->iface;
	tcur = NULL;
	icur = NULL;
	WT_RET(__wt_schema_open_colgroups(session, table));

	/*
	 * If the column groups have not been completely created,
	 * there cannot be data inserted yet, and we're done.
	 */
	if (!table->cg_complete)
		return (0);

	WT_ERR(wt_session->open_cursor(wt_session,
	    idx->source, NULL, "bulk=unordered", &icur));
	WT_ERR(wt_session->open_cursor(wt_session,
	    table->name, NULL, "readonly", &tcur));

	while ((ret = tcur->next(tcur)) == 0)
		WT_ERR(__wt_apply_single_idx(session, idx,
		    icur, (WT_CURSOR_TABLE *)tcur, icur->insert));

	WT_ERR_NOTFOUND_OK(ret);
err:
	if (icur)
		WT_TRET(icur->close(icur));
	if (tcur)
		WT_TRET(tcur->close(tcur));
	return (ret);
}
예제 #12
0
파일: cur_backup.c 프로젝트: DINKIN/mongo
/*
 * __backup_uri --
 *	Backup a list of objects.
 */
static int
__backup_uri(WT_SESSION_IMPL *session,
    const char *cfg[], bool *foundp, bool *log_only)
{
	WT_CONFIG targetconf;
	WT_CONFIG_ITEM cval, k, v;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	bool target_list;
	const char *uri;

	*foundp = *log_only = false;

	/*
	 * If we find a non-empty target configuration string, we have a job,
	 * otherwise it's not our problem.
	 */
	WT_RET(__wt_config_gets(session, cfg, "target", &cval));
	__wt_config_subinit(session, &targetconf, &cval);
	for (target_list = false;
	    (ret = __wt_config_next(&targetconf, &k, &v)) == 0;
	    target_list = true) {
		/* If it is our first time through, allocate. */
		if (!target_list) {
			*foundp = true;
			WT_ERR(__wt_scr_alloc(session, 512, &tmp));
		}

		WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str));
		uri = tmp->data;
		if (v.len != 0)
			WT_ERR_MSG(session, EINVAL,
			    "%s: invalid backup target: URIs may need quoting",
			    uri);

		/*
		 * Handle log targets. We do not need to go through the schema
		 * worker, just call the function to append them. Set log_only
		 * only if it is our only URI target.
		 */
		if (WT_PREFIX_MATCH(uri, "log:")) {
			/*
			 * Log archive cannot mix with incremental backup, don't
			 * let that happen.
			 */
			if (FLD_ISSET(
			    S2C(session)->log_flags, WT_CONN_LOG_ARCHIVE))
				WT_ERR_MSG(session, EINVAL,
				    "incremental backup not possible when "
				    "automatic log archival configured");
			*log_only = !target_list;
			WT_ERR(__backup_log_append(
			    session, session->bkp_cursor, false));
		} else {
			*log_only = false;
			WT_ERR(__wt_schema_worker(session,
			    uri, NULL, __backup_list_uri_append, cfg, 0));
		}
	}
	WT_ERR_NOTFOUND_OK(ret);

err:	__wt_scr_free(session, &tmp);
	return (ret);
}
예제 #13
0
/*
 * __config_merge_scan --
 *	Walk a configuration string, inserting entries into the merged array.
 */
static int
__config_merge_scan(WT_SESSION_IMPL *session,
    const char *key, const char *value, WT_CONFIG_MERGE *cp)
{
	WT_CONFIG cparser;
	WT_CONFIG_ITEM k, v;
	WT_DECL_ITEM(kb);
	WT_DECL_ITEM(vb);
	WT_DECL_RET;
	size_t len;

	WT_ERR(__wt_scr_alloc(session, 0, &kb));
	WT_ERR(__wt_scr_alloc(session, 0, &vb));

	WT_ERR(__wt_config_init(session, &cparser, value));
	while ((ret = __wt_config_next(&cparser, &k, &v)) == 0) {
		if (k.type != WT_CONFIG_ITEM_STRING &&
		    k.type != WT_CONFIG_ITEM_ID)
			WT_ERR_MSG(session, EINVAL,
			    "Invalid configuration key found: '%s'\n", k.str);

		/* Include the quotes around string keys/values. */
		if (k.type == WT_CONFIG_ITEM_STRING) {
			--k.str;
			k.len += 2;
		}
		if (v.type == WT_CONFIG_ITEM_STRING) {
			--v.str;
			v.len += 2;
		}

		/*
		 * !!!
		 * We're using a JSON quote character to separate the names we
		 * create for nested structures. That's not completely safe as
		 * it's possible to quote characters in JSON such that a quote
		 * character appears as a literal character in a key name. In
		 * a few cases, applications can create their own key namespace
		 * (for example, shared library extension names), and therefore
		 * it's possible for an application to confuse us. Error if we
		 * we ever see a key with a magic character.
		 */
		for (len = 0; len < k.len; ++len)
			if (k.str[len] == SEPC)
				WT_ERR_MSG(session, EINVAL,
				    "key %.*s contains a '%c' separator "
				    "character",
				    (int)k.len, (char *)k.str, SEPC);

		/* Build the key/value strings. */
		WT_ERR(__wt_buf_fmt(session,
		    kb, "%s%s%.*s",
		    key == NULL ? "" : key,
		    key == NULL ? "" : SEP,
		    (int)k.len, k.str));
		WT_ERR(__wt_buf_fmt(session,
		    vb, "%.*s", (int)v.len, v.str));

		/*
		 * If the value is a structure, recursively parse it.
		 *
		 * !!!
		 * Don't merge unless the structure has field names. WiredTiger
		 * stores checkpoint LSNs in the metadata file using nested
		 * structures without field names: "checkpoint_lsn=(1,0)", not
		 * "checkpoint_lsn=(file=1,offset=0)". The value type is still
		 * WT_CONFIG_ITEM_STRUCT, so we check for a field name in the
		 * value.
		 */
		if (v.type == WT_CONFIG_ITEM_STRUCT &&
		    strchr(vb->data, '=') != NULL) {
			WT_ERR(__config_merge_scan(
			    session, kb->data, vb->data, cp));
			continue;
		}

		/* Insert the value into the array. */
		WT_ERR(__wt_realloc_def(session,
		    &cp->entries_allocated,
		    cp->entries_next + 1, &cp->entries));
		WT_ERR(__wt_strndup(session,
		    kb->data, kb->size, &cp->entries[cp->entries_next].k));
		WT_ERR(__wt_strndup(session,
		    vb->data, vb->size, &cp->entries[cp->entries_next].v));
		cp->entries[cp->entries_next].gen = cp->entries_next;
		++cp->entries_next;
	}
	WT_ERR_NOTFOUND_OK(ret);

err:	__wt_scr_free(session, &kb);
	__wt_scr_free(session, &vb);
	return (ret);
}
예제 #14
0
/*
 * wiredtiger_open --
 *	Main library entry point: open a new connection to a WiredTiger
 *	database.
 */
int
wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
    const char *config, WT_CONNECTION **wt_connp)
{
	static WT_CONNECTION stdc = {
		__conn_close,
		__conn_reconfigure,
		__conn_get_home,
		__conn_is_new,
		__conn_open_session,
		__conn_load_extension,
		__conn_add_data_source,
		__conn_add_collator,
		__conn_add_compressor,
		__conn_add_extractor
	};
	static struct {
		const char *name;
		uint32_t flag;
	} *ft, directio_types[] = {
		{ "data",	WT_DIRECTIO_DATA },
		{ "log",	WT_DIRECTIO_LOG },
		{ NULL, 0 }
	};
	WT_CONFIG subconfig;
	WT_CONFIG_ITEM cval, skey, sval;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_ITEM *cbuf, expath, exconfig;
	WT_SESSION_IMPL *session;
	const char *cfg[] =
	    { __wt_confdfl_wiredtiger_open, config, NULL, NULL, NULL };
	int exist;

	*wt_connp = NULL;
	session = NULL;
	cbuf = NULL;
	WT_CLEAR(expath);
	WT_CLEAR(exconfig);

	WT_RET(__wt_library_init());

	WT_RET(__wt_calloc_def(NULL, 1, &conn));
	conn->iface = stdc;

	/*
	 * Immediately link the structure into the connection structure list:
	 * the only thing ever looked at on that list is the database name,
	 * and a NULL value is fine.
	 */
	__wt_spin_lock(NULL, &__wt_process.spinlock);
	TAILQ_INSERT_TAIL(&__wt_process.connqh, conn, q);
	__wt_spin_unlock(NULL, &__wt_process.spinlock);

	session = conn->default_session = &conn->dummy_session;
	session->iface.connection = &conn->iface;
	session->name = "wiredtiger_open";
	__wt_event_handler_set(session, event_handler);

	/* Remaining basic initialization of the connection structure. */
	WT_ERR(__wt_connection_init(conn));

	/* Check the configuration strings. */
	WT_ERR(__wt_config_check(
	    session, __wt_confchk_wiredtiger_open, config, 0));

	/* Get the database home. */
	WT_ERR(__conn_home(session, home, cfg));

	/* Make sure no other thread of control already owns this database. */
	WT_ERR(__conn_single(session, cfg));

	/* Read the database-home configuration file. */
	WT_ERR(__conn_config_file(session, cfg, &cbuf));

	/* Read the environment variable configuration. */
	WT_ERR(__conn_config_env(session, cfg));

	WT_ERR(__wt_config_gets(session, cfg, "hazard_max", &cval));
	conn->hazard_max = (uint32_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "session_max", &cval));
	conn->session_size = (uint32_t)cval.val + WT_NUM_INTERNAL_SESSIONS;
	WT_ERR(__wt_config_gets(session, cfg, "lsm_merge", &cval));
	if (cval.val)
		F_SET(conn, WT_CONN_LSM_MERGE);
	WT_ERR(__wt_config_gets(session, cfg, "sync", &cval));
	if (cval.val)
		F_SET(conn, WT_CONN_SYNC);
	WT_ERR(__wt_config_gets(session, cfg, "transactional", &cval));
	if (cval.val)
		F_SET(conn, WT_CONN_TRANSACTIONAL);

	/* Configure verbose flags. */
	WT_ERR(__conn_verbose_config(session, cfg));

	WT_ERR(__wt_conn_cache_pool_config(session, cfg));

	WT_ERR(__wt_config_gets(session, cfg, "logging", &cval));
	if (cval.val != 0)
		WT_ERR(__wt_open(
		   session, WT_LOG_FILENAME, 1, 0, 0, &conn->log_fh));

	/* Configure direct I/O and buffer alignment. */
	WT_ERR(__wt_config_gets(session, cfg, "buffer_alignment", &cval));
	if (cval.val == -1)
		conn->buffer_alignment = WT_BUFFER_ALIGNMENT_DEFAULT;
	else
		conn->buffer_alignment = (size_t)cval.val;
#ifndef HAVE_POSIX_MEMALIGN
	if (conn->buffer_alignment != 0)
		WT_ERR_MSG(session, EINVAL,
		    "buffer_alignment requires posix_memalign");
#endif

	/*
	 * Configuration: direct_io, mmap, statistics.
	 */
	WT_ERR(__wt_config_gets(session, cfg, "direct_io", &cval));
	for (ft = directio_types; ft->name != NULL; ft++) {
		ret = __wt_config_subgets(session, &cval, ft->name, &sval);
		if (ret == 0) {
			if (sval.val)
				FLD_SET(conn->direct_io, ft->flag);
		} else if (ret != WT_NOTFOUND)
			goto err;
	}
	WT_ERR(__wt_config_gets(session, cfg, "mmap", &cval));
	conn->mmap = cval.val == 0 ? 0 : 1;
	WT_ERR(__wt_config_gets(session, cfg, "statistics", &cval));
	conn->statistics = cval.val == 0 ? 0 : 1;

	/* Load any extensions referenced in the config. */
	WT_ERR(__wt_config_gets(session, cfg, "extensions", &cval));
	WT_ERR(__wt_config_subinit(session, &subconfig, &cval));
	while ((ret = __wt_config_next(&subconfig, &skey, &sval)) == 0) {
		WT_ERR(__wt_buf_fmt(
		    session, &expath, "%.*s", (int)skey.len, skey.str));
		if (sval.len > 0)
			WT_ERR(__wt_buf_fmt(session, &exconfig,
			    "entry=%.*s\n", (int)sval.len, sval.str));
		WT_ERR(conn->iface.load_extension(&conn->iface,
		    expath.data, (sval.len > 0) ? exconfig.data : NULL));
	}
	WT_ERR_NOTFOUND_OK(ret);

	/*
	 * Open the connection; if that fails, the connection handle has been
	 * destroyed by the time the open function returns.
	 */
	if ((ret = __wt_connection_open(conn, cfg)) != 0) {
		conn = NULL;
		WT_ERR(ret);
	}

	/* Open the default session. */
	WT_ERR(__wt_open_session(conn, 1, NULL, NULL, &conn->default_session));
	session = conn->default_session;

	/*
	 * Check on the turtle and metadata files, creating them if necessary
	 * (which avoids application threads racing to create the metadata file
	 * later).
	 */
	WT_ERR(__wt_meta_turtle_init(session, &exist));
	if (!exist) {
		/*
		 * We're single-threaded, but acquire the schema lock
		 * regardless: the lower level code checks that it is
		 * appropriately synchronized.
		 */
		WT_WITH_SCHEMA_LOCK(session,
		    ret = __wt_schema_create(session, WT_METADATA_URI, NULL));
		WT_ERR(ret);
	}
	WT_ERR(__wt_metadata_open(session));

	/* If there's a hot-backup file, load it. */
	WT_ERR(__wt_metadata_load_backup(session));

	/*
	 * XXX LSM initialization.
	 * This is structured so that it could be moved to an extension.
	 */
	WT_ERR(__wt_lsm_init(&conn->iface, NULL));

	STATIC_ASSERT(offsetof(WT_CONNECTION_IMPL, iface) == 0);
	*wt_connp = &conn->iface;

	/*
	 * Destroying the connection on error will destroy our session handle,
	 * cleanup using the session handle first, then discard the connection.
	 */
err:	if (cbuf != NULL)
		__wt_buf_free(session, cbuf);
	__wt_buf_free(session, &expath);
	__wt_buf_free(session, &exconfig);

	if (ret != 0 && conn != NULL)
		WT_TRET(__wt_connection_destroy(conn));

	/* Let the server threads proceed. */
	if (ret == 0)
		conn->connection_initialized = 1;

	return (ret);
}
예제 #15
0
/*
 * __wt_lsm_merge --
 *	Merge a set of chunks of an LSM tree.
 */
int
__wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
{
	WT_BLOOM *bloom;
	WT_CURSOR *dest, *src;
	WT_DECL_RET;
	WT_ITEM key, value;
	WT_LSM_CHUNK *chunk;
	uint32_t generation;
	uint64_t insert_count, record_count;
	u_int dest_id, end_chunk, i, nchunks, start_chunk, start_id, verb;
	int tret;
	bool created_chunk, create_bloom, locked, in_sync;
	const char *cfg[3];
	const char *drop_cfg[] =
	    { WT_CONFIG_BASE(session, WT_SESSION_drop), "force", NULL };

	bloom = NULL;
	chunk = NULL;
	dest = src = NULL;
	start_id = 0;
	created_chunk = create_bloom = locked = in_sync = false;

	/* Fast path if it's obvious no merges could be done. */
	if (lsm_tree->nchunks < lsm_tree->merge_min &&
	    lsm_tree->merge_aggressiveness < WT_LSM_AGGRESSIVE_THRESHOLD)
		return (WT_NOTFOUND);

	/*
	 * Use the lsm_tree lock to read the chunks (so no switches occur), but
	 * avoid holding it while the merge is in progress: that may take a
	 * long time.
	 */
	WT_RET(__wt_lsm_tree_writelock(session, lsm_tree));
	locked = true;

	WT_ERR(__lsm_merge_span(session,
	    lsm_tree, id, &start_chunk, &end_chunk, &record_count));
	nchunks = (end_chunk + 1) - start_chunk;

	WT_ASSERT(session, nchunks > 0);
	start_id = lsm_tree->chunk[start_chunk]->id;

	/* Find the merge generation. */
	for (generation = 0, i = 0; i < nchunks; i++)
		generation = WT_MAX(generation,
		    lsm_tree->chunk[start_chunk + i]->generation + 1);

	WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));
	locked = false;

	/* Allocate an ID for the merge. */
	dest_id = __wt_atomic_add32(&lsm_tree->last, 1);

	/*
	 * We only want to do the chunk loop if we're running with verbose,
	 * so we wrap these statements in the conditional.  Avoid the loop
	 * in the normal path.
	 */
	if (WT_VERBOSE_ISSET(session, WT_VERB_LSM)) {
		WT_ERR(__wt_verbose(session, WT_VERB_LSM,
		    "Merging %s chunks %u-%u into %u (%" PRIu64 " records)"
		    ", generation %" PRIu32,
		    lsm_tree->name,
		    start_chunk, end_chunk, dest_id, record_count, generation));
		for (verb = start_chunk; verb <= end_chunk; verb++)
			WT_ERR(__wt_verbose(session, WT_VERB_LSM,
			    "Merging %s: Chunk[%u] id %u, gen: %" PRIu32
			    ", size: %" PRIu64 ", records: %" PRIu64,
			    lsm_tree->name, verb, lsm_tree->chunk[verb]->id,
			    lsm_tree->chunk[verb]->generation,
			    lsm_tree->chunk[verb]->size,
			    lsm_tree->chunk[verb]->count));
	}

	WT_ERR(__wt_calloc_one(session, &chunk));
	created_chunk = true;
	chunk->id = dest_id;

	if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_MERGED) &&
	    (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST) ||
	    start_chunk > 0) && record_count > 0)
		create_bloom = true;

	/*
	 * Special setup for the merge cursor:
	 * first, reset to open the dependent cursors;
	 * then restrict the cursor to a specific number of chunks;
	 * then set MERGE so the cursor doesn't track updates to the tree.
	 */
	WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
	F_SET(src, WT_CURSTD_RAW);
	WT_ERR(__wt_clsm_init_merge(src, start_chunk, start_id, nchunks));

	WT_WITH_SCHEMA_LOCK(session,
	    ret = __wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
	WT_ERR(ret);
	if (create_bloom) {
		WT_ERR(__wt_lsm_tree_setup_bloom(session, lsm_tree, chunk));

		WT_ERR(__wt_bloom_create(session, chunk->bloom_uri,
		    lsm_tree->bloom_config,
		    record_count, lsm_tree->bloom_bit_count,
		    lsm_tree->bloom_hash_count, &bloom));
	}

	/* Discard pages we read as soon as we're done with them. */
	F_SET(session, WT_SESSION_NO_CACHE);

	cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor);
	cfg[1] = "bulk,raw,skip_sort_check";
	cfg[2] = NULL;
	WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));

#define	LSM_MERGE_CHECK_INTERVAL	WT_THOUSAND
	for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
		if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) {
			if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
				WT_ERR(EINTR);

			WT_STAT_FAST_CONN_INCRV(session,
			    lsm_rows_merged, LSM_MERGE_CHECK_INTERVAL);
			++lsm_tree->merge_progressing;
		}

		WT_ERR(src->get_key(src, &key));
		dest->set_key(dest, &key);
		WT_ERR(src->get_value(src, &value));
		dest->set_value(dest, &value);
		WT_ERR(dest->insert(dest));
		if (create_bloom)
			WT_ERR(__wt_bloom_insert(bloom, &key));
	}
	WT_ERR_NOTFOUND_OK(ret);

	WT_STAT_FAST_CONN_INCRV(session,
	    lsm_rows_merged, insert_count % LSM_MERGE_CHECK_INTERVAL);
	++lsm_tree->merge_progressing;
	WT_ERR(__wt_verbose(session, WT_VERB_LSM,
	    "Bloom size for %" PRIu64 " has %" PRIu64 " items inserted.",
	    record_count, insert_count));

	/*
	 * Closing and syncing the files can take a while.  Set the
	 * merge_syncing field so that compact knows it is still in
	 * progress.
	 */
	(void)__wt_atomic_add32(&lsm_tree->merge_syncing, 1);
	in_sync = true;
	/*
	 * We've successfully created the new chunk.  Now install it.  We need
	 * to ensure that the NO_CACHE flag is cleared and the bloom filter
	 * is closed (even if a step fails), so track errors but don't return
	 * until we've cleaned up.
	 */
	WT_TRET(src->close(src));
	WT_TRET(dest->close(dest));
	src = dest = NULL;

	F_CLR(session, WT_SESSION_NO_CACHE);

	/*
	 * We're doing advisory reads to fault the new trees into cache.
	 * Don't block if the cache is full: our next unit of work may be to
	 * discard some trees to free space.
	 */
	F_SET(session, WT_SESSION_NO_EVICTION);

	if (create_bloom) {
		if (ret == 0)
			WT_TRET(__wt_bloom_finalize(bloom));

		/*
		 * Read in a key to make sure the Bloom filters btree handle is
		 * open before it becomes visible to application threads.
		 * Otherwise application threads will stall while it is opened
		 * and internal pages are read into cache.
		 */
		if (ret == 0) {
			WT_CLEAR(key);
			WT_TRET_NOTFOUND_OK(__wt_bloom_get(bloom, &key));
		}

		WT_TRET(__wt_bloom_close(bloom));
		bloom = NULL;
	}
	WT_ERR(ret);

	/*
	 * Open a handle on the new chunk before application threads attempt
	 * to access it, opening it pre-loads internal pages into the file
	 * system cache.
	 */
	cfg[1] = "checkpoint=" WT_CHECKPOINT;
	WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));
	WT_TRET(dest->close(dest));
	dest = NULL;
	++lsm_tree->merge_progressing;
	(void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1);
	in_sync = false;
	WT_ERR_NOTFOUND_OK(ret);

	WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));
	WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
	locked = true;

	/*
	 * Check whether we raced with another merge, and adjust the chunk
	 * array offset as necessary.
	 */
	if (start_chunk >= lsm_tree->nchunks ||
	    lsm_tree->chunk[start_chunk]->id != start_id)
		for (start_chunk = 0;
		    start_chunk < lsm_tree->nchunks;
		    start_chunk++)
			if (lsm_tree->chunk[start_chunk]->id == start_id)
				break;

	/*
	 * It is safe to error out here - since the update can only fail
	 * prior to making updates to the tree.
	 */
	WT_ERR(__wt_lsm_merge_update_tree(
	    session, lsm_tree, start_chunk, nchunks, chunk));

	if (create_bloom)
		F_SET(chunk, WT_LSM_CHUNK_BLOOM);
	chunk->count = insert_count;
	chunk->generation = generation;
	F_SET(chunk, WT_LSM_CHUNK_ONDISK);

	/*
	 * We have no current way of continuing if the metadata update fails,
	 * so we will panic in that case.  Put some effort into cleaning up
	 * after ourselves here - so things have a chance of shutting down.
	 *
	 * Any errors that happened after the tree was locked are
	 * fatal - we can't guarantee the state of the tree.
	 */
	if ((ret = __wt_lsm_meta_write(session, lsm_tree)) != 0)
		WT_PANIC_ERR(session, ret, "Failed finalizing LSM merge");

	lsm_tree->dsk_gen++;

	/* Update the throttling while holding the tree lock. */
	__wt_lsm_tree_throttle(session, lsm_tree, true);

	/* Schedule a pass to discard old chunks */
	WT_ERR(__wt_lsm_manager_push_entry(
	    session, WT_LSM_WORK_DROP, 0, lsm_tree));

err:	if (locked)
		WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
	if (in_sync)
		(void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1);
	if (src != NULL)
		WT_TRET(src->close(src));
	if (dest != NULL)
		WT_TRET(dest->close(dest));
	if (bloom != NULL)
		WT_TRET(__wt_bloom_close(bloom));
	if (ret != 0 && created_chunk) {
		/* Drop the newly-created files on error. */
		if (chunk->uri != NULL) {
			WT_WITH_SCHEMA_LOCK(session, tret =
			    __wt_schema_drop(session, chunk->uri, drop_cfg));
			WT_TRET(tret);
		}
		if (create_bloom && chunk->bloom_uri != NULL) {
			WT_WITH_SCHEMA_LOCK(session,
			    tret = __wt_schema_drop(
			    session, chunk->bloom_uri, drop_cfg));
			WT_TRET(tret);
		}
		__wt_free(session, chunk->bloom_uri);
		__wt_free(session, chunk->uri);
		__wt_free(session, chunk);

		if (ret == EINTR)
			WT_TRET(__wt_verbose(session, WT_VERB_LSM,
			    "Merge aborted due to close"));
		else
			WT_TRET(__wt_verbose(session, WT_VERB_LSM,
			    "Merge failed with %s",
			   __wt_strerror(session, ret, NULL, 0)));
	}
	F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
	return (ret);
}
예제 #16
0
/*
 * __txn_op_apply --
 *	Apply a transactional operation during recovery.
 */
static int
__txn_op_apply(
    WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end)
{
	WT_CURSOR *cursor, *start, *stop;
	WT_DECL_RET;
	WT_ITEM key, start_key, stop_key, value;
	WT_SESSION_IMPL *session;
	uint64_t recno, start_recno, stop_recno;
	uint32_t fileid, mode, optype, opsize;

	session = r->session;
	cursor = NULL;

	/* Peek at the size and the type. */
	WT_ERR(__wt_logop_read(session, pp, end, &optype, &opsize));
	end = *pp + opsize;

	switch (optype) {
	case WT_LOGOP_COL_MODIFY:
		WT_ERR(__wt_logop_col_modify_unpack(session, pp, end,
		    &fileid, &recno, &value));
		GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
		cursor->set_key(cursor, recno);
		if ((ret = cursor->search(cursor)) != 0)
			WT_ERR_NOTFOUND_OK(ret);
		else {
			/*
			 * Build/insert a complete value during recovery rather
			 * than using cursor modify to create a partial update
			 * (for no particular reason than simplicity).
			 */
			WT_ERR(__wt_modify_apply(session, cursor, value.data));
			WT_ERR(cursor->insert(cursor));
		}
		break;

	case WT_LOGOP_COL_PUT:
		WT_ERR(__wt_logop_col_put_unpack(session, pp, end,
		    &fileid, &recno, &value));
		GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
		cursor->set_key(cursor, recno);
		__wt_cursor_set_raw_value(cursor, &value);
		WT_ERR(cursor->insert(cursor));
		break;

	case WT_LOGOP_COL_REMOVE:
		WT_ERR(__wt_logop_col_remove_unpack(session, pp, end,
		    &fileid, &recno));
		GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
		cursor->set_key(cursor, recno);
		WT_ERR(cursor->remove(cursor));
		break;

	case WT_LOGOP_COL_TRUNCATE:
		WT_ERR(__wt_logop_col_truncate_unpack(session, pp, end,
		    &fileid, &start_recno, &stop_recno));
		GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);

		/* Set up the cursors. */
		if (start_recno == WT_RECNO_OOB) {
			start = NULL;
			stop = cursor;
		} else if (stop_recno == WT_RECNO_OOB) {
			start = cursor;
			stop = NULL;
		} else {
			start = cursor;
			WT_ERR(__recovery_cursor(
			    session, r, lsnp, fileid, true, &stop));
		}

		/* Set the keys. */
		if (start != NULL)
			start->set_key(start, start_recno);
		if (stop != NULL)
			stop->set_key(stop, stop_recno);

		WT_TRET(session->iface.truncate(&session->iface, NULL,
		    start, stop, NULL));
		/* If we opened a duplicate cursor, close it now. */
		if (stop != NULL && stop != cursor)
			WT_TRET(stop->close(stop));
		WT_ERR(ret);
		break;

	case WT_LOGOP_ROW_MODIFY:
		WT_ERR(__wt_logop_row_modify_unpack(session, pp, end,
		    &fileid, &key, &value));
		GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
		__wt_cursor_set_raw_key(cursor, &key);
		if ((ret = cursor->search(cursor)) != 0)
			WT_ERR_NOTFOUND_OK(ret);
		else {
			/*
			 * Build/insert a complete value during recovery rather
			 * than using cursor modify to create a partial update
			 * (for no particular reason than simplicity).
			 */
			WT_ERR(__wt_modify_apply(session, cursor, value.data));
			WT_ERR(cursor->insert(cursor));
		}
		break;

	case WT_LOGOP_ROW_PUT:
		WT_ERR(__wt_logop_row_put_unpack(session, pp, end,
		    &fileid, &key, &value));
		GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
		__wt_cursor_set_raw_key(cursor, &key);
		__wt_cursor_set_raw_value(cursor, &value);
		WT_ERR(cursor->insert(cursor));
		break;

	case WT_LOGOP_ROW_REMOVE:
		WT_ERR(__wt_logop_row_remove_unpack(session, pp, end,
		    &fileid, &key));
		GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
		__wt_cursor_set_raw_key(cursor, &key);
		WT_ERR(cursor->remove(cursor));
		break;

	case WT_LOGOP_ROW_TRUNCATE:
		WT_ERR(__wt_logop_row_truncate_unpack(session, pp, end,
		    &fileid, &start_key, &stop_key, &mode));
		GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
		/* Set up the cursors. */
		start = stop = NULL;
		switch (mode) {
		case WT_TXN_TRUNC_ALL:
			/* Both cursors stay NULL. */
			break;
		case WT_TXN_TRUNC_BOTH:
			start = cursor;
			WT_ERR(__recovery_cursor(
			    session, r, lsnp, fileid, true, &stop));
			break;
		case WT_TXN_TRUNC_START:
			start = cursor;
			break;
		case WT_TXN_TRUNC_STOP:
			stop = cursor;
			break;

		WT_ILLEGAL_VALUE_ERR(session, mode);
		}

		/* Set the keys. */
		if (start != NULL)
			__wt_cursor_set_raw_key(start, &start_key);
		if (stop != NULL)
			__wt_cursor_set_raw_key(stop, &stop_key);

		WT_TRET(session->iface.truncate(&session->iface, NULL,
		    start, stop, NULL));
		/* If we opened a duplicate cursor, close it now. */
		if (stop != NULL && stop != cursor)
			WT_TRET(stop->close(stop));
		WT_ERR(ret);
		break;

	WT_ILLEGAL_VALUE_ERR(session, optype);
	}

	/* Reset the cursor so it doesn't block eviction. */
	if (cursor != NULL)
		WT_ERR(cursor->reset(cursor));
	return (0);

err:	__wt_err(session, ret,
	    "operation apply failed during recovery: operation type %"
	    PRIu32 " at LSN %" PRIu32 "/%" PRIu32,
	    optype, lsnp->l.file, lsnp->l.offset);
	return (ret);
}
예제 #17
0
/*
 * __open_index --
 *	Open an index.
 */
static int
__open_index(WT_SESSION_IMPL *session, WT_TABLE *table, WT_INDEX *idx)
{
	WT_CONFIG colconf;
	WT_CONFIG_ITEM ckey, cval, metadata;
	WT_DECL_ITEM(buf);
	WT_DECL_ITEM(plan);
	WT_DECL_RET;
	u_int npublic_cols, i;

	WT_ERR(__wt_scr_alloc(session, 0, &buf));

	/* Get the data source from the index config. */
	WT_ERR(__wt_config_getones(session, idx->config, "source", &cval));
	WT_ERR(__wt_strndup(session, cval.str, cval.len, &idx->source));

	WT_ERR(__wt_config_getones(session, idx->config, "immutable", &cval));
	if (cval.val)
		F_SET(idx, WT_INDEX_IMMUTABLE);

	/*
	 * Compatibility: we didn't always maintain collator information in
	 * index metadata, cope when it isn't found.
	 */
	WT_CLEAR(cval);
	WT_ERR_NOTFOUND_OK(__wt_config_getones(
	    session, idx->config, "collator", &cval));
	if (cval.len != 0) {
		WT_CLEAR(metadata);
		WT_ERR_NOTFOUND_OK(__wt_config_getones(
		    session, idx->config, "app_metadata", &metadata));
		WT_ERR(__wt_collator_config(
		    session, idx->name, &cval, &metadata,
		    &idx->collator, &idx->collator_owned));
	}

	WT_ERR(__wt_extractor_config(
	    session, idx->name, idx->config, &idx->extractor,
	    &idx->extractor_owned));

	WT_ERR(__wt_config_getones(session, idx->config, "key_format", &cval));
	WT_ERR(__wt_strndup(session, cval.str, cval.len, &idx->key_format));

	/*
	 * The key format for an index is somewhat subtle: the application
	 * specifies a set of columns that it will use for the key, but the
	 * engine usually adds some hidden columns in order to derive the
	 * primary key.  These hidden columns are part of the file's key.
	 *
	 * The file's key_format is stored persistently, we need to calculate
	 * the index cursor key format (which will usually omit some of those
	 * keys).
	 */
	WT_ERR(__wt_buf_init(session, buf, 0));
	WT_ERR(__wt_config_getones(
	    session, idx->config, "columns", &idx->colconf));

	/* Start with the declared index columns. */
	WT_ERR(__wt_config_subinit(session, &colconf, &idx->colconf));
	for (npublic_cols = 0;
	    (ret = __wt_config_next(&colconf, &ckey, &cval)) == 0;
	    ++npublic_cols)
		WT_ERR(__wt_buf_catfmt(
		    session, buf, "%.*s,", (int)ckey.len, ckey.str));
	if (ret != WT_NOTFOUND)
		goto err;

	/*
	 * If we didn't find any columns, the index must have an extractor.
	 * We don't rely on this unconditionally because it was only added to
	 * the metadata after version 2.3.1.
	 */
	if (npublic_cols == 0) {
		WT_ERR(__wt_config_getones(
		    session, idx->config, "index_key_columns", &cval));
		npublic_cols = (u_int)cval.val;
		WT_ASSERT(session, npublic_cols != 0);
		for (i = 0; i < npublic_cols; i++)
			WT_ERR(__wt_buf_catfmt(session, buf, "\"bad col\","));
	}

	/*
	 * Now add any primary key columns from the table that are not
	 * already part of the index key.
	 */
	WT_ERR(__wt_config_subinit(session, &colconf, &table->colconf));
	for (i = 0; i < table->nkey_columns &&
	    (ret = __wt_config_next(&colconf, &ckey, &cval)) == 0;
	    i++) {
		/*
		 * If the primary key column is already in the secondary key,
		 * don't add it again.
		 */
		if (__wt_config_subgetraw(
		    session, &idx->colconf, &ckey, &cval) == 0)
			continue;
		WT_ERR(__wt_buf_catfmt(
		    session, buf, "%.*s,", (int)ckey.len, ckey.str));
	}
	WT_ERR_NOTFOUND_OK(ret);

	/*
	 * If the table doesn't yet have its column groups, don't try to
	 * calculate a plan: we are just checking that the index creation is
	 * sane.
	 */
	if (!table->cg_complete)
		goto err;

	WT_ERR(__wt_scr_alloc(session, 0, &plan));
	WT_ERR(__wt_struct_plan(
	    session, table, buf->data, buf->size, false, plan));
	WT_ERR(__wt_strndup(session, plan->data, plan->size, &idx->key_plan));

	/* Set up the cursor key format (the visible columns). */
	WT_ERR(__wt_buf_init(session, buf, 0));
	WT_ERR(__wt_struct_truncate(session,
	    idx->key_format, npublic_cols, buf));
	WT_ERR(__wt_strndup(
	    session, buf->data, buf->size, &idx->idxkey_format));

	/*
	 * Add a trailing padding byte to the format.  This ensures that there
	 * will be no special optimization of the last column, so the primary
	 * key columns can be simply appended.
	 */
	WT_ERR(__wt_buf_catfmt(session, buf, "x"));
	WT_ERR(__wt_strndup(session, buf->data, buf->size, &idx->exkey_format));

	/* By default, index cursor values are the table value columns. */
	/* TODO Optimize to use index columns in preference to table lookups. */
	WT_ERR(__wt_buf_init(session, plan, 0));
	WT_ERR(__wt_struct_plan(session,
	    table, table->colconf.str, table->colconf.len, true, plan));
	WT_ERR(__wt_strndup(session, plan->data, plan->size, &idx->value_plan));

err:	__wt_scr_free(session, &buf);
	__wt_scr_free(session, &plan);
	return (ret);
}
예제 #18
0
/*
 * __wt_las_sweep --
 *	Sweep the lookaside table.
 */
int
__wt_las_sweep(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR *cursor;
	WT_DECL_ITEM(las_addr);
	WT_DECL_ITEM(las_key);
	WT_DECL_RET;
	WT_ITEM *key;
	uint64_t cnt, las_counter, las_txnid;
	int64_t remove_cnt;
	uint32_t las_id, session_flags;
	int notused;

	conn = S2C(session);
	cursor = NULL;
	key = &conn->las_sweep_key;
	remove_cnt = 0;
	session_flags = 0;		/* [-Werror=maybe-uninitialized] */

	WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
	WT_ERR(__wt_scr_alloc(session, 0, &las_key));

	WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));

	/*
	 * If we're not starting a new sweep, position the cursor using the key
	 * from the last call (we don't care if we're before or after the key,
	 * just roughly in the same spot is fine).
	 */
	if (key->size != 0) {
		__wt_cursor_set_raw_key(cursor, key);
		ret = cursor->search_near(cursor, &notused);

		/*
		 * Don't search for the same key twice; if we don't set a new
		 * key below, it's because we've reached the end of the table
		 * and we want the next pass to start at the beginning of the
		 * table. Searching for the same key could leave us stuck at
		 * the end of the table, repeatedly checking the same rows.
		 */
		key->size = 0;
		if (ret != 0)
			goto srch_notfound;
	}

	/*
	 * The sweep server wakes up every 10 seconds (by default), it's a slow
	 * moving thread. Try to review the entire lookaside table once every 5
	 * minutes, or every 30 calls.
	 *
	 * The reason is because the lookaside table exists because we're seeing
	 * cache/eviction pressure (it allows us to trade performance and disk
	 * space for cache space), and it's likely lookaside blocks are being
	 * evicted, and reading them back in doesn't help things. A trickier,
	 * but possibly better, alternative might be to review all lookaside
	 * blocks in the cache in order to get rid of them, and slowly review
	 * lookaside blocks that have already been evicted.
	 */
	cnt = (uint64_t)WT_MAX(100, conn->las_record_cnt / 30);

	/* Discard pages we read as soon as we're done with them. */
	F_SET(session, WT_SESSION_NO_CACHE);

	/* Walk the file. */
	for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) {
		/*
		 * If the loop terminates after completing a work unit, we will
		 * continue the table sweep next time. Get a local copy of the
		 * sweep key, we're going to reset the cursor; do so before
		 * calling cursor.remove, cursor.remove can discard our hazard
		 * pointer and the page could be evicted from underneath us.
		 */
		if (cnt == 1) {
			WT_ERR(__wt_cursor_get_raw_key(cursor, key));
			if (!WT_DATA_IN_ITEM(key))
				WT_ERR(__wt_buf_set(
				    session, key, key->data, key->size));
		}

		WT_ERR(cursor->get_key(cursor,
		    &las_id, las_addr, &las_counter, &las_txnid, las_key));

		/*
		 * If the on-page record transaction ID associated with the
		 * record is globally visible, the record can be discarded.
		 *
		 * Cursor opened overwrite=true: won't return WT_NOTFOUND should
		 * another thread remove the record before we do, and the cursor
		 * remains positioned in that case.
		 */
		if (__wt_txn_visible_all(session, las_txnid)) {
			WT_ERR(cursor->remove(cursor));
			++remove_cnt;
		}
	}

srch_notfound:
	WT_ERR_NOTFOUND_OK(ret);

	if (0) {
err:		__wt_buf_free(session, key);
	}

	WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));

	/*
	 * If there were races to remove records, we can over-count.  All
	 * arithmetic is signed, so underflow isn't fatal, but check anyway so
	 * we don't skew low over time.
	 */
	if (remove_cnt > S2C(session)->las_record_cnt)
		S2C(session)->las_record_cnt = 0;
	else if (remove_cnt > 0)
		(void)__wt_atomic_subi64(&conn->las_record_cnt, remove_cnt);

	F_CLR(session, WT_SESSION_NO_CACHE);

	__wt_scr_free(session, &las_addr);
	__wt_scr_free(session, &las_key);

	return (ret);
}
예제 #19
0
/*
 * __txn_rollback_to_stable_lookaside_fixup --
 *	Remove any updates that need to be rolled back from the lookaside file.
 */
static int
__txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_DECL_TIMESTAMP(rollback_timestamp)
	WT_ITEM las_addr, las_key, las_timestamp;
	WT_TXN_GLOBAL *txn_global;
	uint64_t las_counter, las_txnid, remove_cnt;
	uint32_t las_id, session_flags;

	conn = S2C(session);
	cursor = NULL;
	remove_cnt = 0;
	session_flags = 0;		/* [-Werror=maybe-uninitialized] */
	WT_CLEAR(las_timestamp);

	/*
	 * Copy the stable timestamp, otherwise we'd need to lock it each time
	 * it's accessed. Even though the stable timestamp isn't supposed to be
	 * updated while rolling back, accessing it without a lock would
	 * violate protocol.
	 */
	txn_global = &S2C(session)->txn_global;
	__wt_readlock(session, &txn_global->rwlock);
	__wt_timestamp_set(&rollback_timestamp, &txn_global->stable_timestamp);
	__wt_readunlock(session, &txn_global->rwlock);

	__wt_las_cursor(session, &cursor, &session_flags);

	/* Discard pages we read as soon as we're done with them. */
	F_SET(session, WT_SESSION_NO_CACHE);

	/* Walk the file. */
	for (; (ret = cursor->next(cursor)) == 0; ) {
		WT_ERR(cursor->get_key(cursor, &las_id, &las_addr, &las_counter,
		    &las_txnid, &las_timestamp, &las_key));

		/* Check the file ID so we can skip durable tables */
		if (__bit_test(conn->stable_rollback_bitstring, las_id))
			continue;

		/*
		 * Entries with no timestamp will have a timestamp of zero,
		 * which will fail the following check and cause them to never
		 * be removed.
		 */
		if (__wt_timestamp_cmp(
		    &rollback_timestamp, las_timestamp.data) < 0) {
			WT_ERR(cursor->remove(cursor));
			++remove_cnt;
		}
	}
	WT_ERR_NOTFOUND_OK(ret);
err:	WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
	/*
	 * If there were races to remove records, we can over-count. Underflow
	 * isn't fatal, but check anyway so we don't skew low over time.
	 */
	if (remove_cnt > conn->las_record_cnt)
		conn->las_record_cnt = 0;
	else if (remove_cnt > 0)
		(void)__wt_atomic_sub64(&conn->las_record_cnt, remove_cnt);

	F_CLR(session, WT_SESSION_NO_CACHE);

	return (ret);
}
예제 #20
0
/*
 * __create_colgroup --
 *	Create a column group.
 */
static int
__create_colgroup(WT_SESSION_IMPL *session,
    const char *name, bool exclusive, const char *config)
{
	WT_CONFIG_ITEM cval;
	WT_DECL_RET;
	WT_ITEM confbuf, fmt, namebuf;
	WT_TABLE *table;
	size_t tlen;
	const char **cfgp, *cfg[4] =
	    { WT_CONFIG_BASE(session, colgroup_meta), config, NULL, NULL };
	const char *sourcecfg[] = { config, NULL, NULL };
	const char *cgname, *source, *sourceconf, *tablename;
	char *cgconf, *origconf;
	bool exists;

	sourceconf = NULL;
	cgconf = origconf = NULL;
	WT_CLEAR(fmt);
	WT_CLEAR(confbuf);
	WT_CLEAR(namebuf);
	exists = false;

	tablename = name;
	if (!WT_PREFIX_SKIP(tablename, "colgroup:"))
		return (
		    __wt_unexpected_object_type(session, name, "colgroup:"));
	cgname = strchr(tablename, ':');
	if (cgname != NULL) {
		tlen = (size_t)(cgname - tablename);
		++cgname;
	} else
		tlen = strlen(tablename);

	if ((ret =
	    __wt_schema_get_table(session, tablename, tlen, true, &table)) != 0)
		WT_RET_MSG(session, (ret == WT_NOTFOUND) ? ENOENT : ret,
		    "Can't create '%s' for non-existent table '%.*s'",
		    name, (int)tlen, tablename);

	/* Make sure the column group is referenced from the table. */
	if (cgname != NULL && (ret =
	    __wt_config_subgets(session, &table->cgconf, cgname, &cval)) != 0)
		WT_ERR_MSG(session, EINVAL,
		    "Column group '%s' not found in table '%.*s'",
		    cgname, (int)tlen, tablename);

	/* Check if the column group already exists. */
	if ((ret = __wt_metadata_search(session, name, &origconf)) == 0) {
		if (exclusive)
			WT_ERR(EEXIST);
		exists = true;
	}
	WT_ERR_NOTFOUND_OK(ret);

	/* Find the first NULL entry in the cfg stack. */
	for (cfgp = &cfg[1]; *cfgp; cfgp++)
		;

	/* Add the source to the colgroup config before collapsing. */
	if (__wt_config_getones(
	    session, config, "source", &cval) == 0 && cval.len != 0) {
		WT_ERR(__wt_buf_fmt(
		    session, &namebuf, "%.*s", (int)cval.len, cval.str));
		source = namebuf.data;
	} else {
		WT_ERR(__wt_schema_colgroup_source(
		    session, table, cgname, config, &namebuf));
		source = namebuf.data;
		WT_ERR(__wt_buf_fmt(
		    session, &confbuf, "source=\"%s\"", source));
		*cfgp++ = confbuf.data;
	}

	/* Calculate the key/value formats: these go into the source config. */
	WT_ERR(__wt_buf_fmt(session, &fmt, "key_format=%s", table->key_format));
	if (cgname == NULL)
		WT_ERR(__wt_buf_catfmt
		    (session, &fmt, ",value_format=%s", table->value_format));
	else {
		if (__wt_config_getones(session, config, "columns", &cval) != 0)
			WT_ERR_MSG(session, EINVAL,
			    "No 'columns' configuration for '%s'", name);
		WT_ERR(__wt_buf_catfmt(session, &fmt, ",value_format="));
		WT_ERR(__wt_struct_reformat(session,
		    table, cval.str, cval.len, NULL, true, &fmt));
	}
	sourcecfg[1] = fmt.data;
	WT_ERR(__wt_config_merge(session, sourcecfg, NULL, &sourceconf));
	WT_ERR(__wt_schema_create(session, source, sourceconf));

	WT_ERR(__wt_config_collapse(session, cfg, &cgconf));

	if (!exists) {
		WT_ERR(__wt_metadata_insert(session, name, cgconf));
		WT_ERR(__wt_schema_open_colgroups(session, table));
	}

err:	__wt_free(session, cgconf);
	__wt_free(session, sourceconf);
	__wt_free(session, origconf);
	__wt_buf_free(session, &confbuf);
	__wt_buf_free(session, &fmt);
	__wt_buf_free(session, &namebuf);

	__wt_schema_release_table(session, table);
	return (ret);
}
예제 #21
0
/*
 * __wt_lsm_meta_read --
 *	Read the metadata for an LSM tree.
 */
int
__wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_CONFIG cparser, lparser;
	WT_CONFIG_ITEM ck, cv, fileconf, lk, lv, metadata;
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	char *lsmconfig;
	u_int nchunks;

	chunk = NULL;			/* -Wconditional-uninitialized */

	/* LSM trees inherit the merge setting from the connection. */
	if (F_ISSET(S2C(session), WT_CONN_LSM_MERGE))
		F_SET(lsm_tree, WT_LSM_TREE_MERGES);

	WT_RET(__wt_metadata_search(session, lsm_tree->name, &lsmconfig));
	WT_ERR(__wt_config_init(session, &cparser, lsmconfig));
	while ((ret = __wt_config_next(&cparser, &ck, &cv)) == 0) {
		if (WT_STRING_MATCH("key_format", ck.str, ck.len)) {
			__wt_free(session, lsm_tree->key_format);
			WT_ERR(__wt_strndup(session,
			    cv.str, cv.len, &lsm_tree->key_format));
		} else if (WT_STRING_MATCH("value_format", ck.str, ck.len)) {
			__wt_free(session, lsm_tree->value_format);
			WT_ERR(__wt_strndup(session,
			    cv.str, cv.len, &lsm_tree->value_format));
		} else if (WT_STRING_MATCH("collator", ck.str, ck.len)) {
			if (cv.len == 0 ||
			    WT_STRING_MATCH("none", cv.str, cv.len))
				continue;
			/*
			 * Extract the application-supplied metadata (if any)
			 * from the file configuration.
			 */
			WT_ERR(__wt_config_getones(
			    session, lsmconfig, "file_config", &fileconf));
			WT_CLEAR(metadata);
			WT_ERR_NOTFOUND_OK(__wt_config_subgets(
			    session, &fileconf, "app_metadata", &metadata));
			WT_ERR(__wt_collator_config(session, lsm_tree->name,
			    &cv, &metadata,
			    &lsm_tree->collator, &lsm_tree->collator_owned));
			WT_ERR(__wt_strndup(session,
			    cv.str, cv.len, &lsm_tree->collator_name));
		} else if (WT_STRING_MATCH("bloom_config", ck.str, ck.len)) {
			__wt_free(session, lsm_tree->bloom_config);
			/* Don't include the brackets. */
			WT_ERR(__wt_strndup(session,
			    cv.str + 1, cv.len - 2, &lsm_tree->bloom_config));
		} else if (WT_STRING_MATCH("file_config", ck.str, ck.len)) {
			__wt_free(session, lsm_tree->file_config);
			/* Don't include the brackets. */
			WT_ERR(__wt_strndup(session,
			    cv.str + 1, cv.len - 2, &lsm_tree->file_config));
		} else if (WT_STRING_MATCH("auto_throttle", ck.str, ck.len)) {
			if (cv.val)
				F_SET(lsm_tree, WT_LSM_TREE_THROTTLE);
			else
				F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE);
		} else if (WT_STRING_MATCH("bloom", ck.str, ck.len))
			lsm_tree->bloom = (uint32_t)cv.val;
		else if (WT_STRING_MATCH("bloom_bit_count", ck.str, ck.len))
			lsm_tree->bloom_bit_count = (uint32_t)cv.val;
		else if (WT_STRING_MATCH("bloom_hash_count", ck.str, ck.len))
			lsm_tree->bloom_hash_count = (uint32_t)cv.val;
		else if (WT_STRING_MATCH("chunk_count_limit", ck.str, ck.len)) {
			lsm_tree->chunk_count_limit = (uint32_t)cv.val;
			if (cv.val != 0)
				F_CLR(lsm_tree, WT_LSM_TREE_MERGES);
		} else if (WT_STRING_MATCH("chunk_max", ck.str, ck.len))
			lsm_tree->chunk_max = (uint64_t)cv.val;
		else if (WT_STRING_MATCH("chunk_size", ck.str, ck.len))
			lsm_tree->chunk_size = (uint64_t)cv.val;
		else if (WT_STRING_MATCH("merge_max", ck.str, ck.len))
			lsm_tree->merge_max = (uint32_t)cv.val;
		else if (WT_STRING_MATCH("merge_min", ck.str, ck.len))
			lsm_tree->merge_min = (uint32_t)cv.val;
		else if (WT_STRING_MATCH("last", ck.str, ck.len))
			lsm_tree->last = (u_int)cv.val;
		else if (WT_STRING_MATCH("chunks", ck.str, ck.len)) {
			WT_ERR(__wt_config_subinit(session, &lparser, &cv));
			for (nchunks = 0; (ret =
			    __wt_config_next(&lparser, &lk, &lv)) == 0; ) {
				if (WT_STRING_MATCH("id", lk.str, lk.len)) {
					WT_ERR(__wt_realloc_def(session,
					    &lsm_tree->chunk_alloc,
					    nchunks + 1, &lsm_tree->chunk));
					WT_ERR(
					    __wt_calloc_one(session, &chunk));
					lsm_tree->chunk[nchunks++] = chunk;
					chunk->id = (uint32_t)lv.val;
					WT_ERR(__wt_lsm_tree_chunk_name(session,
					    lsm_tree, chunk->id, &chunk->uri));
					F_SET(chunk,
					    WT_LSM_CHUNK_ONDISK |
					    WT_LSM_CHUNK_STABLE);
				} else if (WT_STRING_MATCH(
				    "bloom", lk.str, lk.len)) {
					WT_ERR(__wt_lsm_tree_bloom_name(
					    session, lsm_tree,
					    chunk->id, &chunk->bloom_uri));
					F_SET(chunk, WT_LSM_CHUNK_BLOOM);
					continue;
				} else if (WT_STRING_MATCH(
				    "chunk_size", lk.str, lk.len)) {
					chunk->size = (uint64_t)lv.val;
					continue;
				} else if (WT_STRING_MATCH(
				    "count", lk.str, lk.len)) {
					chunk->count = (uint64_t)lv.val;
					continue;
				} else if (WT_STRING_MATCH(
				    "generation", lk.str, lk.len)) {
					chunk->generation = (uint32_t)lv.val;
					continue;
				}
			}
			WT_ERR_NOTFOUND_OK(ret);
			lsm_tree->nchunks = nchunks;
		} else if (WT_STRING_MATCH("old_chunks", ck.str, ck.len)) {
			WT_ERR(__wt_config_subinit(session, &lparser, &cv));
			for (nchunks = 0; (ret =
			    __wt_config_next(&lparser, &lk, &lv)) == 0; ) {
				if (WT_STRING_MATCH("bloom", lk.str, lk.len)) {
					WT_ERR(__wt_strndup(session,
					    lv.str, lv.len, &chunk->bloom_uri));
					F_SET(chunk, WT_LSM_CHUNK_BLOOM);
					continue;
				}
				WT_ERR(__wt_realloc_def(session,
				    &lsm_tree->old_alloc, nchunks + 1,
				    &lsm_tree->old_chunks));
				WT_ERR(__wt_calloc_one(session, &chunk));
				lsm_tree->old_chunks[nchunks++] = chunk;
				WT_ERR(__wt_strndup(session,
				    lk.str, lk.len, &chunk->uri));
				F_SET(chunk, WT_LSM_CHUNK_ONDISK);
			}
			WT_ERR_NOTFOUND_OK(ret);
			lsm_tree->nold_chunks = nchunks;
		}
		/*
		 * Ignore any other values: the metadata entry might have been
		 * created by a future release, with unknown options.
		 */
	}
	WT_ERR_NOTFOUND_OK(ret);

	/*
	 * If the default merge_min was not overridden, calculate it now.  We
	 * do this here so that trees created before merge_min was added get a
	 * sane value.
	 */
	if (lsm_tree->merge_min < 2)
		lsm_tree->merge_min = WT_MAX(2, lsm_tree->merge_max / 2);

err:	__wt_free(session, lsmconfig);
	return (ret);
}
예제 #22
0
파일: cache_las.c 프로젝트: qihsh/mongo
/*
 * __wt_las_sweep --
 *	Sweep the lookaside table.
 */
int
__wt_las_sweep(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR *cursor;
	WT_DECL_ITEM(las_addr);
	WT_DECL_ITEM(las_key);
	WT_DECL_RET;
	WT_ITEM *key;
	uint64_t cnt, las_counter, las_txnid;
	uint32_t las_id, session_flags;
	int notused;

	conn = S2C(session);
	cursor = NULL;
	key = &conn->las_sweep_key;
	session_flags = 0;		/* [-Werror=maybe-uninitialized] */

	WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
	WT_ERR(__wt_scr_alloc(session, 0, &las_key));

	WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));

	/*
	 * If we're not starting a new sweep, position the cursor using the key
	 * from the last call (we don't care if we're before or after the key,
	 * just roughly in the same spot is fine).
	 */
	if (conn->las_sweep_call != 0 && key->data != NULL) {
		__wt_cursor_set_raw_key(cursor, key);
		if ((ret = cursor->search_near(cursor, &notused)) != 0)
			goto srch_notfound;
	}

	/*
	 * The sweep server wakes up every 10 seconds (by default), it's a slow
	 * moving thread. Try to review the entire lookaside table once every 5
	 * minutes, or every 30 calls.
	 *
	 * The reason is because the lookaside table exists because we're seeing
	 * cache/eviction pressure (it allows us to trade performance and disk
	 * space for cache space), and it's likely lookaside blocks are being
	 * evicted, and reading them back in doesn't help things. A trickier,
	 * but possibly better, alternative might be to review all lookaside
	 * blocks in the cache in order to get rid of them, and slowly review
	 * lookaside blocks that have already been evicted.
	 *
	 * We can't know for sure how many records are in the lookaside table,
	 * the cursor insert and remove statistics aren't updated atomically.
	 * Start with reviewing 100 rows, and if it takes more than the target
	 * number of calls to finish, increase the number of rows checked on
	 * each call; if it takes less than the target calls to finish, then
	 * decrease the number of rows reviewed on each call (but never less
	 * than 100).
	 */
#define	WT_SWEEP_LOOKASIDE_MIN_CNT	100
#define	WT_SWEEP_LOOKASIDE_PASS_TARGET	 30
	++conn->las_sweep_call;
	if ((cnt = conn->las_sweep_cnt) < WT_SWEEP_LOOKASIDE_MIN_CNT)
		cnt = conn->las_sweep_cnt = WT_SWEEP_LOOKASIDE_MIN_CNT;

	/* Walk the file. */
	for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) {
		/*
		 * If the loop terminates after completing a work unit, we will
		 * continue the table sweep next time. Get a local copy of the
		 * sweep key, we're going to reset the cursor; do so before
		 * calling cursor.remove, cursor.remove can discard our hazard
		 * pointer and the page could be evicted from underneath us.
		 */
		if (cnt == 1) {
			WT_ERR(__wt_cursor_get_raw_key(cursor, key));
			if (!WT_DATA_IN_ITEM(key))
				WT_ERR(__wt_buf_set(
				    session, key, key->data, key->size));
		}

		WT_ERR(cursor->get_key(cursor,
		    &las_id, las_addr, &las_counter, &las_txnid, las_key));

		/*
		 * If the on-page record transaction ID associated with the
		 * record is globally visible, the record can be discarded.
		 *
		 * Cursor opened overwrite=true: won't return WT_NOTFOUND should
		 * another thread remove the record before we do, and the cursor
		 * remains positioned in that case.
		 */
		if (__wt_txn_visible_all(session, las_txnid))
			WT_ERR(cursor->remove(cursor));
	}

	/*
	 * When reaching the lookaside table end or the target number of calls,
	 * adjust the row count. Decrease/increase the row count depending on
	 * if the number of calls is less/more than the target.
	 */
	if (ret == WT_NOTFOUND ||
	    conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET) {
		if (conn->las_sweep_call < WT_SWEEP_LOOKASIDE_PASS_TARGET &&
		    conn->las_sweep_cnt > WT_SWEEP_LOOKASIDE_MIN_CNT)
			conn->las_sweep_cnt -= WT_SWEEP_LOOKASIDE_MIN_CNT;
		if (conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET)
			conn->las_sweep_cnt += WT_SWEEP_LOOKASIDE_MIN_CNT;
	}

srch_notfound:
	if (ret == WT_NOTFOUND)
		conn->las_sweep_call = 0;

	WT_ERR_NOTFOUND_OK(ret);

	if (0) {
err:		__wt_buf_free(session, key);
	}

	WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));

	__wt_scr_free(session, &las_addr);
	__wt_scr_free(session, &las_key);

	return (ret);
}
예제 #23
0
/*
 * __wt_curds_open --
 *	Initialize a data-source cursor.
 */
int
__wt_curds_open(
    WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner,
    const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR **cursorp)
{
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,		/* get-key */
	    __wt_cursor_get_value,		/* get-value */
	    __wt_cursor_set_key,		/* set-key */
	    __wt_cursor_set_value,		/* set-value */
	    __curds_compare,			/* compare */
	    __wt_cursor_equals,			/* equals */
	    __curds_next,			/* next */
	    __curds_prev,			/* prev */
	    __curds_reset,			/* reset */
	    __curds_search,			/* search */
	    __curds_search_near,		/* search-near */
	    __curds_insert,			/* insert */
	    __wt_cursor_modify_notsup,		/* modify */
	    __curds_update,			/* update */
	    __curds_remove,			/* remove */
	    __curds_reserve,			/* reserve */
	    __wt_cursor_reconfigure_notsup,	/* reconfigure */
	    __curds_close);			/* close */
	WT_CONFIG_ITEM cval, metadata;
	WT_CURSOR *cursor, *source;
	WT_CURSOR_DATA_SOURCE *data_source;
	WT_DECL_RET;
	char *metaconf;

	WT_STATIC_ASSERT(offsetof(WT_CURSOR_DATA_SOURCE, iface) == 0);

	data_source = NULL;
	metaconf = NULL;

	WT_RET(__wt_calloc_one(session, &data_source));
	cursor = &data_source->iface;
	*cursor = iface;
	cursor->session = &session->iface;

	/*
	 * XXX
	 * The underlying data-source may require the object's key and value
	 * formats.  This isn't a particularly elegant way of getting that
	 * information to the data-source, this feels like a layering problem
	 * to me.
	 */
	WT_ERR(__wt_metadata_search(session, uri, &metaconf));
	WT_ERR(__wt_config_getones(session, metaconf, "key_format", &cval));
	WT_ERR(__wt_strndup(session, cval.str, cval.len, &cursor->key_format));
	WT_ERR(__wt_config_getones(session, metaconf, "value_format", &cval));
	WT_ERR(
	    __wt_strndup(session, cval.str, cval.len, &cursor->value_format));

	WT_ERR(__wt_cursor_init(cursor, uri, owner, cfg, cursorp));

	/* Data-source cursors may have a custom collator. */
	ret = __wt_config_getones(session, metaconf, "collator", &cval);
	if (ret == 0 && cval.len != 0) {
		WT_CLEAR(metadata);
		WT_ERR_NOTFOUND_OK(__wt_config_getones(
		    session, metaconf, "app_metadata", &metadata));
		WT_ERR(__wt_collator_config(session, uri, &cval, &metadata,
		    &data_source->collator, &data_source->collator_owned));
	}
	WT_ERR_NOTFOUND_OK(ret);

	WT_ERR(dsrc->open_cursor(dsrc,
	    &session->iface, uri, (WT_CONFIG_ARG *)cfg, &data_source->source));
	source = data_source->source;
	source->session = (WT_SESSION *)session;
	memset(&source->q, 0, sizeof(source->q));
	source->recno = WT_RECNO_OOB;
	memset(source->raw_recno_buf, 0, sizeof(source->raw_recno_buf));
	memset(&source->key, 0, sizeof(source->key));
	memset(&source->value, 0, sizeof(source->value));
	source->saved_err = 0;
	source->flags = 0;

	if (0) {
err:		WT_TRET(__curds_close(cursor));
		*cursorp = NULL;
	}

	__wt_free(session, metaconf);
	return (ret);
}
예제 #24
0
파일: bt_read.c 프로젝트: qihsh/mongo
/*
 * __las_page_instantiate --
 *	Instantiate lookaside update records in a recently read page.
 */
static int
__las_page_instantiate(WT_SESSION_IMPL *session,
    WT_REF *ref, uint32_t read_id, const uint8_t *addr, size_t addr_size)
{
	WT_CURSOR *cursor;
	WT_CURSOR_BTREE cbt;
	WT_DECL_ITEM(current_key);
	WT_DECL_ITEM(las_addr);
	WT_DECL_ITEM(las_key);
	WT_DECL_ITEM(las_value);
	WT_DECL_RET;
	WT_PAGE *page;
	WT_UPDATE *first_upd, *last_upd, *upd;
	size_t incr, total_incr;
	uint64_t current_recno, las_counter, las_txnid, recno, upd_txnid;
	uint32_t las_id, upd_size, session_flags;
	int exact;
	const uint8_t *p;

	cursor = NULL;
	page = ref->page;
	first_upd = last_upd = upd = NULL;
	total_incr = 0;
	current_recno = recno = WT_RECNO_OOB;
	session_flags = 0;		/* [-Werror=maybe-uninitialized] */

	__wt_btcur_init(session, &cbt);
	__wt_btcur_open(&cbt);

	WT_ERR(__wt_scr_alloc(session, 0, &current_key));
	WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
	WT_ERR(__wt_scr_alloc(session, 0, &las_key));
	WT_ERR(__wt_scr_alloc(session, 0, &las_value));

	/* Open a lookaside table cursor. */
	WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));

	/*
	 * The lookaside records are in key and update order, that is, there
	 * will be a set of in-order updates for a key, then another set of
	 * in-order updates for a subsequent key. We process all of the updates
	 * for a key and then insert those updates into the page, then all the
	 * updates for the next key, and so on.
	 *
	 * Search for the block's unique prefix, stepping through any matching
	 * records.
	 */
	las_addr->data = addr;
	las_addr->size = addr_size;
	las_key->size = 0;
	cursor->set_key(
	    cursor, read_id, las_addr, (uint64_t)0, (uint32_t)0, las_key);
	if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0)
		ret = cursor->next(cursor);
	for (; ret == 0; ret = cursor->next(cursor)) {
		WT_ERR(cursor->get_key(cursor,
		    &las_id, las_addr, &las_counter, &las_txnid, las_key));

		/*
		 * Confirm the search using the unique prefix; if not a match,
		 * we're done searching for records for this page.
		 */
		if (las_id != read_id ||
		    las_addr->size != addr_size ||
		    memcmp(las_addr->data, addr, addr_size) != 0)
			break;

		/*
		 * If the on-page value has become globally visible, this record
		 * is no longer needed.
		 */
		if (__wt_txn_visible_all(session, las_txnid))
			continue;

		/* Allocate the WT_UPDATE structure. */
		WT_ERR(cursor->get_value(
		    cursor, &upd_txnid, &upd_size, las_value));
		WT_ERR(__wt_update_alloc(session,
		    (upd_size == WT_UPDATE_DELETED_VALUE) ? NULL : las_value,
		    &upd, &incr));
		total_incr += incr;
		upd->txnid = upd_txnid;

		switch (page->type) {
		case WT_PAGE_COL_FIX:
		case WT_PAGE_COL_VAR:
			p = las_key->data;
			WT_ERR(__wt_vunpack_uint(&p, 0, &recno));
			if (current_recno == recno)
				break;
			WT_ASSERT(session, current_recno < recno);

			if (first_upd != NULL) {
				WT_ERR(__col_instantiate(session,
				    current_recno, ref, &cbt, first_upd));
				first_upd = NULL;
			}
			current_recno = recno;
			break;
		case WT_PAGE_ROW_LEAF:
			if (current_key->size == las_key->size &&
			    memcmp(current_key->data,
			    las_key->data, las_key->size) == 0)
				break;

			if (first_upd != NULL) {
				WT_ERR(__row_instantiate(session,
				    current_key, ref, &cbt, first_upd));
				first_upd = NULL;
			}
			WT_ERR(__wt_buf_set(session,
			    current_key, las_key->data, las_key->size));
			break;
		WT_ILLEGAL_VALUE_ERR(session);
		}

		/* Append the latest update to the list. */
		if (first_upd == NULL)
			first_upd = last_upd = upd;
		else {
			last_upd->next = upd;
			last_upd = upd;
		}
		upd = NULL;
	}
	WT_ERR_NOTFOUND_OK(ret);

	/* Insert the last set of updates, if any. */
	if (first_upd != NULL)
		switch (page->type) {
		case WT_PAGE_COL_FIX:
		case WT_PAGE_COL_VAR:
			WT_ERR(__col_instantiate(session,
			    current_recno, ref, &cbt, first_upd));
			first_upd = NULL;
			break;
		case WT_PAGE_ROW_LEAF:
			WT_ERR(__row_instantiate(session,
			    current_key, ref, &cbt, first_upd));
			first_upd = NULL;
			break;
		WT_ILLEGAL_VALUE_ERR(session);
		}

	/* Discard the cursor. */
	WT_ERR(__wt_las_cursor_close(session, &cursor, session_flags));

	if (total_incr != 0) {
		__wt_cache_page_inmem_incr(session, page, total_incr);

		/*
		 * We've modified/dirtied the page, but that's not necessary and
		 * if we keep the page clean, it's easier to evict. We leave the
		 * lookaside table updates in place, so if we evict this page
		 * without dirtying it, any future instantiation of it will find
		 * the records it needs. If the page is dirtied before eviction,
		 * then we'll write any needed lookaside table records for the
		 * new location of the page.
		 */
		__wt_page_modify_clear(session, page);
	}

err:	WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
	WT_TRET(__wt_btcur_close(&cbt, 1));

	/*
	 * On error, upd points to a single unlinked WT_UPDATE structure,
	 * first_upd points to a list.
	 */
	if (upd != NULL)
		__wt_free(session, upd);
	if (first_upd != NULL)
		__wt_free_update_list(session, first_upd);

	__wt_scr_free(session, &current_key);
	__wt_scr_free(session, &las_addr);
	__wt_scr_free(session, &las_key);
	__wt_scr_free(session, &las_value);

	return (ret);
}
예제 #25
0
/*
 * __create_index --
 *	Create an index.
 */
static int
__create_index(WT_SESSION_IMPL *session,
    const char *name, int exclusive, const char *config)
{
	WT_CONFIG kcols, pkcols;
	WT_CONFIG_ITEM ckey, cval, icols, kval;
	WT_DECL_PACK_VALUE(pv);
	WT_DECL_RET;
	WT_ITEM confbuf, extra_cols, fmt, namebuf;
	WT_PACK pack;
	WT_TABLE *table;
	const char *cfg[4] =
	    { WT_CONFIG_BASE(session, index_meta), NULL, NULL, NULL };
	const char *sourcecfg[] = { config, NULL, NULL };
	const char *source, *sourceconf, *idxname, *tablename;
	char *idxconf;
	size_t tlen;
	int have_extractor;
	u_int i, npublic_cols;

	sourceconf = NULL;
	idxconf = NULL;
	WT_CLEAR(confbuf);
	WT_CLEAR(fmt);
	WT_CLEAR(extra_cols);
	WT_CLEAR(namebuf);
	have_extractor = 0;

	tablename = name;
	if (!WT_PREFIX_SKIP(tablename, "index:"))
		return (EINVAL);
	idxname = strchr(tablename, ':');
	if (idxname == NULL)
		WT_RET_MSG(session, EINVAL, "Invalid index name, "
		    "should be <table name>:<index name>: %s", name);

	tlen = (size_t)(idxname++ - tablename);
	if ((ret =
	    __wt_schema_get_table(session, tablename, tlen, 1, &table)) != 0)
		WT_RET_MSG(session, ret,
		    "Can't create an index for a non-existent table: %.*s",
		    (int)tlen, tablename);

	if (table->is_simple)
		WT_RET_MSG(session, EINVAL,
		    "%s requires a table with named columns", name);

	if (__wt_config_getones(session, config, "source", &cval) == 0) {
		WT_ERR(__wt_buf_fmt(session, &namebuf,
		    "%.*s", (int)cval.len, cval.str));
		source = namebuf.data;
	} else {
		WT_ERR(__wt_schema_index_source(
		    session, table, idxname, config, &namebuf));
		source = namebuf.data;

		/* Add the source name to the index config before collapsing. */
		WT_ERR(__wt_buf_catfmt(session, &confbuf,
		    ",source=\"%s\"", source));
	}

	if (__wt_config_getones_none(
	    session, config, "extractor", &cval) == 0 && cval.len != 0) {
		have_extractor = 1;
		/* Custom extractors must supply a key format. */
		if ((ret = __wt_config_getones(
		    session, config, "key_format", &kval)) != 0)
			WT_ERR_MSG(session, EINVAL,
			    "%s: custom extractors require a key_format", name);
	}

	/* Calculate the key/value formats. */
	WT_CLEAR(icols);
	if (__wt_config_getones(session, config, "columns", &icols) != 0 &&
	    !have_extractor)
		WT_ERR_MSG(session, EINVAL,
		    "%s: requires 'columns' configuration", name);

	/*
	 * Count the public columns using the declared columns for normal
	 * indices or the key format for custom extractors.
	 */
	npublic_cols = 0;
	if (!have_extractor) {
		WT_ERR(__wt_config_subinit(session, &kcols, &icols));
		while ((ret = __wt_config_next(&kcols, &ckey, &cval)) == 0)
			++npublic_cols;
		WT_ERR_NOTFOUND_OK(ret);
	} else {
		WT_ERR(__pack_initn(session, &pack, kval.str, kval.len));
		while ((ret = __pack_next(&pack, &pv)) == 0)
			++npublic_cols;
		WT_ERR_NOTFOUND_OK(ret);
	}

	/*
	 * The key format for an index is somewhat subtle: the application
	 * specifies a set of columns that it will use for the key, but the
	 * engine usually adds some hidden columns in order to derive the
	 * primary key.  These hidden columns are part of the source's
	 * key_format, which we are calculating now, but not part of an index
	 * cursor's key_format.
	 */
	WT_ERR(__wt_config_subinit(session, &pkcols, &table->colconf));
	for (i = 0; i < table->nkey_columns &&
	    (ret = __wt_config_next(&pkcols, &ckey, &cval)) == 0;
	    i++) {
		/*
		 * If the primary key column is already in the secondary key,
		 * don't add it again.
		 */
		if (__wt_config_subgetraw(session, &icols, &ckey, &cval) == 0) {
			if (have_extractor)
				WT_ERR_MSG(session, EINVAL,
				    "an index with a custom extractor may not "
				    "include primary key columns");
			continue;
		}
		WT_ERR(__wt_buf_catfmt(
		    session, &extra_cols, "%.*s,", (int)ckey.len, ckey.str));
	}
	if (ret != 0 && ret != WT_NOTFOUND)
		goto err;

	/* Index values are empty: all columns are packed into the index key. */
	WT_ERR(__wt_buf_fmt(session, &fmt, "value_format=,key_format="));

	if (have_extractor) {
		WT_ERR(__wt_buf_catfmt(session, &fmt, "%.*s",
		    (int)kval.len, kval.str));
		WT_CLEAR(icols);
	}

	/*
	 * Construct the index key format, or append the primary key columns
	 * for custom extractors.
	 */
	WT_ERR(__wt_struct_reformat(session, table,
	    icols.str, icols.len, (const char *)extra_cols.data, 0, &fmt));

	/* Check for a record number index key, which makes no sense. */
	WT_ERR(__wt_config_getones(session, fmt.data, "key_format", &cval));
	if (cval.len == 1 && cval.str[0] == 'r')
		WT_ERR_MSG(session, EINVAL,
		    "column-store index may not use the record number as its "
		    "index key");

	WT_ERR(__wt_buf_catfmt(
	    session, &fmt, ",index_key_columns=%u", npublic_cols));

	sourcecfg[1] = fmt.data;
	WT_ERR(__wt_config_merge(session, sourcecfg, NULL, &sourceconf));

	WT_ERR(__wt_schema_create(session, source, sourceconf));

	cfg[1] = sourceconf;
	cfg[2] = confbuf.data;
	WT_ERR(__wt_config_collapse(session, cfg, &idxconf));
	if ((ret = __wt_metadata_insert(session, name, idxconf)) != 0) {
		/*
		 * If the entry already exists in the metadata, we're done.
		 * This is an error for exclusive creates but okay otherwise.
		 */
		if (ret == WT_DUPLICATE_KEY)
			ret = exclusive ? EEXIST : 0;
		goto err;
	}

	/* Make sure that the configuration is valid. */
	WT_ERR(__wt_schema_open_index(
	    session, table, idxname, strlen(idxname), NULL));

err:	__wt_free(session, idxconf);
	__wt_free(session, sourceconf);
	__wt_buf_free(session, &confbuf);
	__wt_buf_free(session, &extra_cols);
	__wt_buf_free(session, &fmt);
	__wt_buf_free(session, &namebuf);

	__wt_schema_release_table(session, table);
	return (ret);
}
예제 #26
0
/*
 * __create_table --
 *	Create a table.
 */
static int
__create_table(WT_SESSION_IMPL *session,
    const char *name, int exclusive, const char *config)
{
	WT_CONFIG conf;
	WT_CONFIG_ITEM cgkey, cgval, cval;
	WT_DECL_RET;
	WT_TABLE *table;
	const char *cfg[4] =
	    { WT_CONFIG_BASE(session, table_meta), config, NULL, NULL };
	const char *tablename;
	char *tableconf, *cgname;
	size_t cgsize;
	int ncolgroups;

	cgname = NULL;
	table = NULL;
	tableconf = NULL;

	tablename = name;
	if (!WT_PREFIX_SKIP(tablename, "table:"))
		return (EINVAL);

	if ((ret = __wt_schema_get_table(session,
	    tablename, strlen(tablename), 0, &table)) == 0) {
		__wt_schema_release_table(session, table);
		return (exclusive ? EEXIST : 0);
	}
	WT_RET_NOTFOUND_OK(ret);

	WT_ERR(__wt_config_gets(session, cfg, "colgroups", &cval));
	WT_ERR(__wt_config_subinit(session, &conf, &cval));
	for (ncolgroups = 0;
	    (ret = __wt_config_next(&conf, &cgkey, &cgval)) == 0;
	    ncolgroups++)
		;
	WT_ERR_NOTFOUND_OK(ret);

	WT_ERR(__wt_config_collapse(session, cfg, &tableconf));
	if ((ret = __wt_metadata_insert(session, name, tableconf)) != 0) {
		/*
		 * If the entry already exists in the metadata, we're done.
		 * This is an error for exclusive creates but okay otherwise.
		 */
		if (ret == WT_DUPLICATE_KEY)
			ret = exclusive ? EEXIST : 0;
		goto err;
	}

	/* Attempt to open the table now to catch any errors. */
	WT_ERR(__wt_schema_get_table(
	    session, tablename, strlen(tablename), 1, &table));

	if (ncolgroups == 0) {
		cgsize = strlen("colgroup:") + strlen(tablename) + 1;
		WT_ERR(__wt_calloc_def(session, cgsize, &cgname));
		snprintf(cgname, cgsize, "colgroup:%s", tablename);
		WT_ERR(__create_colgroup(session, cgname, exclusive, config));
	}

	if (0) {
err:		if (table != NULL) {
			WT_TRET(__wt_schema_remove_table(session, table));
			table = NULL;
		}
	}
	if (table != NULL)
		__wt_schema_release_table(session, table);
	__wt_free(session, cgname);
	__wt_free(session, tableconf);
	return (ret);
}
예제 #27
0
/*
 * __schema_open_index --
 *	Open one or more indices for a table (internal version).
 */
static int
__schema_open_index(WT_SESSION_IMPL *session,
    WT_TABLE *table, const char *idxname, size_t len, WT_INDEX **indexp)
{
	WT_CURSOR *cursor;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_INDEX *idx;
	u_int i;
	int cmp;
	bool match;
	const char *idxconf, *name, *tablename, *uri;

	/* Check if we've already done the work. */
	if (idxname == NULL && table->idx_complete)
		return (0);

	cursor = NULL;
	idx = NULL;
	match = false;

	/* Build a search key. */
	tablename = table->name;
	(void)WT_PREFIX_SKIP(tablename, "table:");
	WT_ERR(__wt_scr_alloc(session, 512, &tmp));
	WT_ERR(__wt_buf_fmt(session, tmp, "index:%s:", tablename));

	/* Find matching indices. */
	WT_ERR(__wt_metadata_cursor(session, &cursor));
	cursor->set_key(cursor, tmp->data);
	if ((ret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0)
		ret = cursor->next(cursor);
	for (i = 0; ret == 0; i++, ret = cursor->next(cursor)) {
		WT_ERR(cursor->get_key(cursor, &uri));
		name = uri;
		if (!WT_PREFIX_SKIP(name, tmp->data))
			break;

		/* Is this the index we are looking for? */
		match = idxname == NULL || WT_STRING_MATCH(name, idxname, len);

		/*
		 * Ensure there is space, including if we have to make room for
		 * a new entry in the middle of the list.
		 */
		WT_ERR(__wt_realloc_def(session, &table->idx_alloc,
		    WT_MAX(i, table->nindices) + 1, &table->indices));

		/* Keep the in-memory list in sync with the metadata. */
		cmp = 0;
		while (table->indices[i] != NULL &&
		    (cmp = strcmp(uri, table->indices[i]->name)) > 0) {
			/* Index no longer exists, remove it. */
			__wt_free(session, table->indices[i]);
			memmove(&table->indices[i], &table->indices[i + 1],
			    (table->nindices - i) * sizeof(WT_INDEX *));
			table->indices[--table->nindices] = NULL;
		}
		if (cmp < 0) {
			/* Make room for a new index. */
			memmove(&table->indices[i + 1], &table->indices[i],
			    (table->nindices - i) * sizeof(WT_INDEX *));
			table->indices[i] = NULL;
			++table->nindices;
		}

		if (!match)
			continue;

		if (table->indices[i] == NULL) {
			WT_ERR(cursor->get_value(cursor, &idxconf));
			WT_ERR(__wt_calloc_one(session, &idx));
			WT_ERR(__wt_strdup(session, uri, &idx->name));
			WT_ERR(__wt_strdup(session, idxconf, &idx->config));
			WT_ERR(__open_index(session, table, idx));

			/*
			 * If we're checking the creation of an index before a
			 * table is fully created, don't save the index: it
			 * will need to be reopened once the table is complete.
			 */
			if (!table->cg_complete) {
				WT_ERR(
				    __wt_schema_destroy_index(session, &idx));
				if (idxname != NULL)
					break;
				continue;
			}

			table->indices[i] = idx;
			idx = NULL;

			/*
			 * If the slot is bigger than anything else we've seen,
			 * bump the number of indices.
			 */
			if (i >= table->nindices)
				table->nindices = i + 1;
		}

		/* If we were looking for a single index, we're done. */
		if (indexp != NULL)
			*indexp = table->indices[i];
		if (idxname != NULL)
			break;
	}
	WT_ERR_NOTFOUND_OK(ret);
	if (idxname != NULL && !match)
		ret = WT_NOTFOUND;

	/* If we did a full pass, we won't need to do it again. */
	if (idxname == NULL) {
		table->nindices = i;
		table->idx_complete = true;
	}

err:	WT_TRET(__wt_metadata_cursor_release(session, &cursor));
	WT_TRET(__wt_schema_destroy_index(session, &idx));

	__wt_scr_free(session, &tmp);
	return (ret);
}
예제 #28
0
/*
 * __create_table --
 *	Create a table.
 */
static int
__create_table(WT_SESSION_IMPL *session,
    const char *name, bool exclusive, const char *config)
{
	WT_CONFIG conf;
	WT_CONFIG_ITEM cgkey, cgval, cval;
	WT_DECL_RET;
	WT_TABLE *table;
	const char *cfg[4] =
	    { WT_CONFIG_BASE(session, table_meta), config, NULL, NULL };
	const char *tablename;
	char *tableconf, *cgname;
	size_t cgsize;
	int ncolgroups;
	bool exists;

	cgname = NULL;
	table = NULL;
	tableconf = NULL;
	exists = false;

	tablename = name;
	if (!WT_PREFIX_SKIP(tablename, "table:"))
		return (__wt_unexpected_object_type(session, name, "table:"));

	if ((ret = __wt_schema_get_table(session,
	    tablename, strlen(tablename), false, &table)) == 0) {
		if (exclusive)
			WT_ERR(EEXIST);
		exists = true;
	}
	WT_ERR_NOTFOUND_OK(ret);

	WT_ERR(__wt_config_gets(session, cfg, "colgroups", &cval));
	__wt_config_subinit(session, &conf, &cval);
	for (ncolgroups = 0;
	    (ret = __wt_config_next(&conf, &cgkey, &cgval)) == 0;
	    ncolgroups++)
		;
	WT_ERR_NOTFOUND_OK(ret);

	WT_ERR(__wt_config_collapse(session, cfg, &tableconf));

	if (!exists) {
		WT_ERR(__wt_metadata_insert(session, name, tableconf));

		/* Attempt to open the table now to catch any errors. */
		WT_ERR(__wt_schema_get_table(
		    session, tablename, strlen(tablename), true, &table));

		if (ncolgroups == 0) {
			cgsize = strlen("colgroup:") + strlen(tablename) + 1;
			WT_ERR(__wt_calloc_def(session, cgsize, &cgname));
			snprintf(cgname, cgsize, "colgroup:%s", tablename);
			WT_ERR(__create_colgroup(
			    session, cgname, exclusive, config));
		}
	}

	if (0) {
err:		if (table != NULL) {
			WT_TRET(__wt_schema_remove_table(session, table));
			table = NULL;
		}
	}
	if (table != NULL)
		__wt_schema_release_table(session, table);
	__wt_free(session, cgname);
	__wt_free(session, tableconf);
	return (ret);
}
예제 #29
0
/*
 * __lsm_bloom_create --
 *	Create a bloom filter for a chunk of the LSM tree that has been
 *	checkpointed but not yet been merged.
 */
static int
__lsm_bloom_create(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk, u_int chunk_off)
{
	WT_BLOOM *bloom;
	WT_CURSOR *src;
	WT_DECL_RET;
	WT_ITEM key;
	uint64_t insert_count;

	WT_RET(__wt_lsm_tree_setup_bloom(session, lsm_tree, chunk));

	bloom = NULL;
	/*
	 * This is merge-like activity, and we don't want compacts to give up
	 * because we are creating a bunch of bloom filters before merging.
	 */
	++lsm_tree->merge_progressing;
	WT_RET(__wt_bloom_create(session, chunk->bloom_uri,
	    lsm_tree->bloom_config, chunk->count,
	    lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom));

	/* Open a special merge cursor just on this chunk. */
	WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
	F_SET(src, WT_CURSTD_RAW);
	WT_ERR(__wt_clsm_init_merge(src, chunk_off, chunk->id, 1));

	/*
	 * Setup so that we don't hold pages we read into cache, and so
	 * that we don't get stuck if the cache is full. If we allow
	 * ourselves to get stuck creating bloom filters, the entire tree
	 * can stall since there may be no worker threads available to flush.
	 */
	F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
	for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
		WT_ERR(src->get_key(src, &key));
		WT_ERR(__wt_bloom_insert(bloom, &key));
	}
	WT_ERR_NOTFOUND_OK(ret);
	WT_TRET(src->close(src));

	WT_TRET(__wt_bloom_finalize(bloom));
	WT_ERR(ret);

	F_CLR(session, WT_SESSION_NO_CACHE);

	/* Load the new Bloom filter into cache. */
	WT_CLEAR(key);
	WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key));

	WT_ERR(__wt_verbose(session, WT_VERB_LSM,
	    "LSM worker created bloom filter %s. "
	    "Expected %" PRIu64 " items, got %" PRIu64,
	    chunk->bloom_uri, chunk->count, insert_count));

	/* Ensure the bloom filter is in the metadata. */
	WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
	F_SET(chunk, WT_LSM_CHUNK_BLOOM);
	ret = __wt_lsm_meta_write(session, lsm_tree);
	++lsm_tree->dsk_gen;
	WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));

	if (ret != 0)
		WT_ERR_MSG(session, ret, "LSM bloom worker metadata write");

err:	if (bloom != NULL)
		WT_TRET(__wt_bloom_close(bloom));
	F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
	return (ret);
}
예제 #30
0
파일: cur_join.c 프로젝트: mongodb/mongo
/*
 * __curjoin_init_bloom --
 *	Populate Bloom filters
 */
static int
__curjoin_init_bloom(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    WT_CURSOR_JOIN_ENTRY *entry, WT_BLOOM *bloom)
{
	WT_COLLATOR *collator;
	WT_CURSOR *c;
	WT_CURSOR_JOIN_ENDPOINT *end, *endmax;
	WT_DECL_ITEM(uribuf);
	WT_DECL_RET;
	WT_ITEM curkey, curvalue;
	size_t size;
	u_int skip;
	int cmp;
	const char *uri;
	const char *raw_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), "raw", NULL };

	c = NULL;
	skip = 0;

	if (entry->index != NULL)
		/*
		 * Open the raw index.  We're avoiding any references
		 * to the main table, they may be expensive.
		 */
		uri = entry->index->source;
	else {
		/*
		 * For joins on the main table, we just need the primary
		 * key for comparison, we don't need any values.
		 */
		size = strlen(cjoin->table->iface.name) + 3;
		WT_ERR(__wt_scr_alloc(session, size, &uribuf));
		WT_ERR(__wt_buf_fmt(session, uribuf, "%s()",
		    cjoin->table->iface.name));
		uri = uribuf->data;
	}
	WT_ERR(__wt_open_cursor(session, uri, &cjoin->iface, raw_cfg, &c));

	/* Initially position the cursor if necessary. */
	endmax = &entry->ends[entry->ends_next];
	if ((end = &entry->ends[0]) < endmax) {
		if (F_ISSET(end, WT_CURJOIN_END_GT) ||
		    WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_EQ) {
			WT_ERR(__wt_cursor_dup_position(end->cursor, c));
			if (WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_GE)
				skip = 1;
		} else if (F_ISSET(end, WT_CURJOIN_END_LT)) {
			if ((ret = c->next(c)) == WT_NOTFOUND)
				goto done;
			WT_ERR(ret);
		} else
			WT_PANIC_ERR(session, EINVAL,
			    "fatal error in join cursor position state");
	}
	collator = (entry->index == NULL) ? NULL : entry->index->collator;
	while (ret == 0) {
		WT_ERR(c->get_key(c, &curkey));
		entry->stats.iterated++;
		if (entry->index != NULL) {
			/*
			 * Repack so it's comparable to the
			 * reference endpoints.
			 */
			WT_ERR(__wt_struct_repack(session,
			    c->key_format,
			    (entry->repack_format != NULL ?
			    entry->repack_format : entry->index->idxkey_format),
			    &c->key, &curkey));
		}
		for (end = &entry->ends[skip]; end < endmax; end++) {
			WT_ERR(__wt_compare(session, collator, &curkey,
			    &end->key, &cmp));
			if (F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION)) {
				/* if condition satisfied, insert immediately */
				switch (WT_CURJOIN_END_RANGE(end)) {
				case WT_CURJOIN_END_EQ:
					if (cmp == 0)
						goto insert;
					break;
				case WT_CURJOIN_END_GT:
					if (cmp > 0) {
						/* skip this check next time */
						skip = entry->ends_next;
						goto insert;
					}
					break;
				case WT_CURJOIN_END_GE:
					if (cmp >= 0)
						goto insert;
					break;
				case WT_CURJOIN_END_LT:
					if (cmp < 0)
						goto insert;
					break;
				case WT_CURJOIN_END_LE:
					if (cmp <= 0)
						goto insert;
					break;
				}
			} else if (!F_ISSET(end, WT_CURJOIN_END_LT)) {
				if (cmp < 0 || (cmp == 0 &&
				    !F_ISSET(end, WT_CURJOIN_END_EQ)))
					goto advance;
				if (cmp > 0) {
					if (F_ISSET(end, WT_CURJOIN_END_GT))
						skip = 1;
					else
						goto done;
				}
			} else {
				if (cmp > 0 || (cmp == 0 &&
				    !F_ISSET(end, WT_CURJOIN_END_EQ)))
					goto done;
			}
		}
		/*
		 * Either it's a disjunction that hasn't satisfied any
		 * condition, or it's a conjunction that has satisfied all
		 * conditions.
		 */
		if (F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION))
			goto advance;
insert:
		if (entry->index != NULL) {
			curvalue.data =
			    (unsigned char *)curkey.data + curkey.size;
			WT_ASSERT(session, c->key.size > curkey.size);
			curvalue.size = c->key.size - curkey.size;
		}
		else
			WT_ERR(c->get_key(c, &curvalue));
		__wt_bloom_insert(bloom, &curvalue);
		entry->stats.bloom_insert++;
advance:
		if ((ret = c->next(c)) == WT_NOTFOUND)
			break;
	}
done:
	WT_ERR_NOTFOUND_OK(ret);

err:	if (c != NULL)
		WT_TRET(c->close(c));
	__wt_scr_free(session, &uribuf);
	return (ret);
}