/* * __create_colgroup -- * Create a column group. */ static int __create_colgroup(WT_SESSION_IMPL *session, const char *name, bool exclusive, const char *config) { WT_CONFIG_ITEM cval; WT_DECL_RET; WT_ITEM confbuf, fmt, namebuf; WT_TABLE *table; size_t tlen; const char **cfgp, *cfg[4] = { WT_CONFIG_BASE(session, colgroup_meta), config, NULL, NULL }; const char *sourcecfg[] = { config, NULL, NULL }; const char *cgname, *source, *sourceconf, *tablename; char *cgconf, *origconf; bool exists; sourceconf = NULL; cgconf = origconf = NULL; WT_CLEAR(fmt); WT_CLEAR(confbuf); WT_CLEAR(namebuf); exists = false; tablename = name; if (!WT_PREFIX_SKIP(tablename, "colgroup:")) return ( __wt_unexpected_object_type(session, name, "colgroup:")); cgname = strchr(tablename, ':'); if (cgname != NULL) { tlen = (size_t)(cgname - tablename); ++cgname; } else tlen = strlen(tablename); if ((ret = __wt_schema_get_table(session, tablename, tlen, true, &table)) != 0) WT_RET_MSG(session, (ret == WT_NOTFOUND) ? ENOENT : ret, "Can't create '%s' for non-existent table '%.*s'", name, (int)tlen, tablename); /* Make sure the column group is referenced from the table. */ if (cgname != NULL && (ret = __wt_config_subgets(session, &table->cgconf, cgname, &cval)) != 0) WT_ERR_MSG(session, EINVAL, "Column group '%s' not found in table '%.*s'", cgname, (int)tlen, tablename); /* Check if the column group already exists. */ if ((ret = __wt_metadata_search(session, name, &origconf)) == 0) { if (exclusive) WT_ERR(EEXIST); exists = true; } WT_ERR_NOTFOUND_OK(ret); /* Find the first NULL entry in the cfg stack. */ for (cfgp = &cfg[1]; *cfgp; cfgp++) ; /* Add the source to the colgroup config before collapsing. */ if (__wt_config_getones( session, config, "source", &cval) == 0 && cval.len != 0) { WT_ERR(__wt_buf_fmt( session, &namebuf, "%.*s", (int)cval.len, cval.str)); source = namebuf.data; } else { WT_ERR(__wt_schema_colgroup_source( session, table, cgname, config, &namebuf)); source = namebuf.data; WT_ERR(__wt_buf_fmt( session, &confbuf, "source=\"%s\"", source)); *cfgp++ = confbuf.data; } /* Calculate the key/value formats: these go into the source config. */ WT_ERR(__wt_buf_fmt(session, &fmt, "key_format=%s", table->key_format)); if (cgname == NULL) WT_ERR(__wt_buf_catfmt (session, &fmt, ",value_format=%s", table->value_format)); else { if (__wt_config_getones(session, config, "columns", &cval) != 0) WT_ERR_MSG(session, EINVAL, "No 'columns' configuration for '%s'", name); WT_ERR(__wt_buf_catfmt(session, &fmt, ",value_format=")); WT_ERR(__wt_struct_reformat(session, table, cval.str, cval.len, NULL, true, &fmt)); } sourcecfg[1] = fmt.data; WT_ERR(__wt_config_merge(session, sourcecfg, NULL, &sourceconf)); WT_ERR(__wt_schema_create(session, source, sourceconf)); WT_ERR(__wt_config_collapse(session, cfg, &cgconf)); if (!exists) { WT_ERR(__wt_metadata_insert(session, name, cgconf)); WT_ERR(__wt_schema_open_colgroups(session, table)); } err: __wt_free(session, cgconf); __wt_free(session, sourceconf); __wt_free(session, origconf); __wt_buf_free(session, &confbuf); __wt_buf_free(session, &fmt); __wt_buf_free(session, &namebuf); __wt_schema_release_table(session, table); return (ret); }
/* * __create_index -- * Create an index. */ static int __create_index(WT_SESSION_IMPL *session, const char *name, bool exclusive, const char *config) { WT_CONFIG kcols, pkcols; WT_CONFIG_ITEM ckey, cval, icols, kval; WT_DECL_PACK_VALUE(pv); WT_DECL_RET; WT_INDEX *idx; WT_ITEM confbuf, extra_cols, fmt, namebuf; WT_PACK pack; WT_TABLE *table; const char *cfg[4] = { WT_CONFIG_BASE(session, index_meta), NULL, NULL, NULL }; const char *sourcecfg[] = { config, NULL, NULL }; const char *source, *sourceconf, *idxname, *tablename; char *idxconf, *origconf; size_t tlen; bool exists, have_extractor; u_int i, npublic_cols; sourceconf = NULL; idxconf = origconf = NULL; WT_CLEAR(confbuf); WT_CLEAR(fmt); WT_CLEAR(extra_cols); WT_CLEAR(namebuf); exists = have_extractor = false; tablename = name; if (!WT_PREFIX_SKIP(tablename, "index:")) return (__wt_unexpected_object_type(session, name, "index:")); idxname = strchr(tablename, ':'); if (idxname == NULL) WT_RET_MSG(session, EINVAL, "Invalid index name, " "should be <table name>:<index name>: %s", name); tlen = (size_t)(idxname++ - tablename); if ((ret = __wt_schema_get_table(session, tablename, tlen, true, &table)) != 0) WT_RET_MSG(session, ret, "Can't create an index for a non-existent table: %.*s", (int)tlen, tablename); if (table->is_simple) WT_ERR_MSG(session, EINVAL, "%s requires a table with named columns", name); /* Check if the index already exists. */ if ((ret = __wt_metadata_search(session, name, &origconf)) == 0) { if (exclusive) WT_ERR(EEXIST); exists = true; } WT_ERR_NOTFOUND_OK(ret); if (__wt_config_getones(session, config, "source", &cval) == 0) { WT_ERR(__wt_buf_fmt(session, &namebuf, "%.*s", (int)cval.len, cval.str)); source = namebuf.data; } else { WT_ERR(__wt_schema_index_source( session, table, idxname, config, &namebuf)); source = namebuf.data; /* Add the source name to the index config before collapsing. */ WT_ERR(__wt_buf_catfmt(session, &confbuf, ",source=\"%s\"", source)); } if (__wt_config_getones_none( session, config, "extractor", &cval) == 0 && cval.len != 0) { have_extractor = true; /* Custom extractors must supply a key format. */ if ((ret = __wt_config_getones( session, config, "key_format", &kval)) != 0) WT_ERR_MSG(session, EINVAL, "%s: custom extractors require a key_format", name); } /* Calculate the key/value formats. */ WT_CLEAR(icols); if (__wt_config_getones(session, config, "columns", &icols) != 0 && !have_extractor) WT_ERR_MSG(session, EINVAL, "%s: requires 'columns' configuration", name); /* * Count the public columns using the declared columns for normal * indices or the key format for custom extractors. */ npublic_cols = 0; if (!have_extractor) { __wt_config_subinit(session, &kcols, &icols); while ((ret = __wt_config_next(&kcols, &ckey, &cval)) == 0) ++npublic_cols; WT_ERR_NOTFOUND_OK(ret); } else { WT_ERR(__pack_initn(session, &pack, kval.str, kval.len)); while ((ret = __pack_next(&pack, &pv)) == 0) ++npublic_cols; WT_ERR_NOTFOUND_OK(ret); } /* * The key format for an index is somewhat subtle: the application * specifies a set of columns that it will use for the key, but the * engine usually adds some hidden columns in order to derive the * primary key. These hidden columns are part of the source's * key_format, which we are calculating now, but not part of an index * cursor's key_format. */ __wt_config_subinit(session, &pkcols, &table->colconf); for (i = 0; i < table->nkey_columns && (ret = __wt_config_next(&pkcols, &ckey, &cval)) == 0; i++) { /* * If the primary key column is already in the secondary key, * don't add it again. */ if (__wt_config_subgetraw(session, &icols, &ckey, &cval) == 0) { if (have_extractor) WT_ERR_MSG(session, EINVAL, "an index with a custom extractor may not " "include primary key columns"); continue; } WT_ERR(__wt_buf_catfmt( session, &extra_cols, "%.*s,", (int)ckey.len, ckey.str)); } WT_ERR_NOTFOUND_OK(ret); /* Index values are empty: all columns are packed into the index key. */ WT_ERR(__wt_buf_fmt(session, &fmt, "value_format=,key_format=")); if (have_extractor) { WT_ERR(__wt_buf_catfmt(session, &fmt, "%.*s", (int)kval.len, kval.str)); WT_CLEAR(icols); } /* * Construct the index key format, or append the primary key columns * for custom extractors. */ WT_ERR(__wt_struct_reformat(session, table, icols.str, icols.len, (const char *)extra_cols.data, false, &fmt)); /* Check for a record number index key, which makes no sense. */ WT_ERR(__wt_config_getones(session, fmt.data, "key_format", &cval)); if (cval.len == 1 && cval.str[0] == 'r') WT_ERR_MSG(session, EINVAL, "column-store index may not use the record number as its " "index key"); WT_ERR(__wt_buf_catfmt( session, &fmt, ",index_key_columns=%u", npublic_cols)); sourcecfg[1] = fmt.data; WT_ERR(__wt_config_merge(session, sourcecfg, NULL, &sourceconf)); WT_ERR(__wt_schema_create(session, source, sourceconf)); cfg[1] = sourceconf; cfg[2] = confbuf.data; WT_ERR(__wt_config_collapse(session, cfg, &idxconf)); if (!exists) { WT_ERR(__wt_metadata_insert(session, name, idxconf)); /* Make sure that the configuration is valid. */ WT_ERR(__wt_schema_open_index( session, table, idxname, strlen(idxname), &idx)); /* If there is data in the table, fill the index. */ WT_ERR(__fill_index(session, table, idx)); } err: __wt_free(session, idxconf); __wt_free(session, origconf); __wt_free(session, sourceconf); __wt_buf_free(session, &confbuf); __wt_buf_free(session, &extra_cols); __wt_buf_free(session, &fmt); __wt_buf_free(session, &namebuf); __wt_schema_release_table(session, table); return (ret); }
/* * __wt_txn_recover -- * Run recovery. */ int __wt_txn_recover(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_CURSOR *metac; WT_DECL_RET; WT_RECOVERY r; struct WT_RECOVERY_FILE *metafile; char *config; int needs_rec, was_backup; conn = S2C(session); WT_CLEAR(r); WT_INIT_LSN(&r.ckpt_lsn); was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP) ? 1 : 0; /* We need a real session for recovery. */ WT_RET(__wt_open_session(conn, NULL, NULL, &session)); F_SET(session, WT_SESSION_NO_LOGGING); r.session = session; WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config)); WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config)); WT_ERR(__wt_metadata_cursor(session, NULL, &metac)); metafile = &r.files[WT_METAFILE_ID]; metafile->c = metac; /* * If no log was found (including if logging is disabled), or if the * last checkpoint was done with logging disabled, recovery should not * run. Scan the metadata to figure out the largest file ID. */ if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_EXISTED) || WT_IS_MAX_LSN(&metafile->ckpt_lsn)) { WT_ERR(__recovery_file_scan(&r)); conn->next_file_id = r.max_fileid; goto done; } /* * First, do a pass through the log to recover the metadata, and * establish the last checkpoint LSN. Skip this when opening a hot * backup: we already have the correct metadata in that case. */ if (!was_backup) { r.metadata_only = 1; if (WT_IS_INIT_LSN(&metafile->ckpt_lsn)) WT_ERR(__wt_log_scan(session, NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r)); else { /* * Start at the last checkpoint LSN referenced in the * metadata. If we see the end of a checkpoint while * scanning, we will change the full scan to start from * there. */ r.ckpt_lsn = metafile->ckpt_lsn; WT_ERR(__wt_log_scan(session, &metafile->ckpt_lsn, 0, __txn_log_recover, &r)); } } /* Scan the metadata to find the live files and their IDs. */ WT_ERR(__recovery_file_scan(&r)); /* * We no longer need the metadata cursor: close it to avoid pinning any * resources that could block eviction during recovery. */ r.files[0].c = NULL; WT_ERR(metac->close(metac)); /* * Now, recover all the files apart from the metadata. * Pass WT_LOGSCAN_RECOVER so that old logs get truncated. */ r.metadata_only = 0; WT_ERR(__wt_verbose(session, WT_VERB_RECOVERY, "Main recovery loop: starting at %u/%" PRIuMAX, r.ckpt_lsn.file, (uintmax_t)r.ckpt_lsn.offset)); WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec)); /* * Check if the database was shut down cleanly. If not * return an error if the user does not want automatic * recovery. */ if (needs_rec && FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR)) WT_ERR(WT_RUN_RECOVERY); /* * Always run recovery even if it was a clean shutdown. * We can consider skipping it in the future. */ if (WT_IS_INIT_LSN(&r.ckpt_lsn)) WT_ERR(__wt_log_scan(session, NULL, WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER, __txn_log_recover, &r)); else WT_ERR(__wt_log_scan(session, &r.ckpt_lsn, WT_LOGSCAN_RECOVER, __txn_log_recover, &r)); conn->next_file_id = r.max_fileid; /* * If recovery ran successfully forcibly log a checkpoint so the next * open is fast and keep the metadata up to date with the checkpoint * LSN and archiving. */ WT_ERR(session->iface.checkpoint(&session->iface, "force=1")); done: err: WT_TRET(__recovery_free(&r)); __wt_free(session, config); WT_TRET(session->iface.close(&session->iface, NULL)); return (ret); }
/* * __wt_curds_open -- * Initialize a data-source cursor. */ int __wt_curds_open( WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, __wt_cursor_get_key, /* get-key */ __wt_cursor_get_value, /* get-value */ __wt_cursor_set_key, /* set-key */ __wt_cursor_set_value, /* set-value */ __curds_compare, /* compare */ __wt_cursor_equals, /* equals */ __curds_next, /* next */ __curds_prev, /* prev */ __curds_reset, /* reset */ __curds_search, /* search */ __curds_search_near, /* search-near */ __curds_insert, /* insert */ __curds_update, /* update */ __curds_remove, /* remove */ __wt_cursor_reconfigure_notsup, /* reconfigure */ __curds_close); /* close */ WT_CONFIG_ITEM cval, metadata; WT_CURSOR *cursor, *source; WT_CURSOR_DATA_SOURCE *data_source; WT_DECL_RET; char *metaconf; WT_STATIC_ASSERT(offsetof(WT_CURSOR_DATA_SOURCE, iface) == 0); data_source = NULL; metaconf = NULL; WT_RET(__wt_calloc_one(session, &data_source)); cursor = &data_source->iface; *cursor = iface; cursor->session = &session->iface; /* * XXX * The underlying data-source may require the object's key and value * formats. This isn't a particularly elegant way of getting that * information to the data-source, this feels like a layering problem * to me. */ WT_ERR(__wt_metadata_search(session, uri, &metaconf)); WT_ERR(__wt_config_getones(session, metaconf, "key_format", &cval)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &cursor->key_format)); WT_ERR(__wt_config_getones(session, metaconf, "value_format", &cval)); WT_ERR( __wt_strndup(session, cval.str, cval.len, &cursor->value_format)); WT_ERR(__wt_cursor_init(cursor, uri, owner, cfg, cursorp)); /* Data-source cursors may have a custom collator. */ WT_ERR( __wt_config_getones(session, metaconf, "app_metadata", &metadata)); WT_ERR(__wt_config_gets_none(session, cfg, "collator", &cval)); if (cval.len != 0) WT_ERR(__wt_collator_config(session, uri, &cval, &metadata, &data_source->collator, &data_source->collator_owned)); WT_ERR(dsrc->open_cursor(dsrc, &session->iface, uri, (WT_CONFIG_ARG *)cfg, &data_source->source)); source = data_source->source; source->session = (WT_SESSION *)session; memset(&source->q, 0, sizeof(source->q)); source->recno = WT_RECNO_OOB; memset(source->raw_recno_buf, 0, sizeof(source->raw_recno_buf)); memset(&source->key, 0, sizeof(source->key)); memset(&source->value, 0, sizeof(source->value)); source->saved_err = 0; source->flags = 0; if (0) { err: WT_TRET(__curds_close(cursor)); *cursorp = NULL; } __wt_free(session, metaconf); return (ret); }
/* * __create_file -- * Create a new 'file:' object. */ static int __create_file(WT_SESSION_IMPL *session, const char *uri, bool exclusive, const char *config) { WT_DECL_ITEM(val); WT_DECL_RET; const char *filename, **p, *filecfg[] = { WT_CONFIG_BASE(session, file_meta), config, NULL, NULL }; char *fileconf; uint32_t allocsize; bool is_metadata; fileconf = NULL; is_metadata = strcmp(uri, WT_METAFILE_URI) == 0; filename = uri; if (!WT_PREFIX_SKIP(filename, "file:")) WT_RET_MSG(session, EINVAL, "Expected a 'file:' URI: %s", uri); /* Check if the file already exists. */ if (!is_metadata && (ret = __wt_metadata_search(session, uri, &fileconf)) != WT_NOTFOUND) { if (exclusive) WT_TRET(EEXIST); goto err; } /* Sanity check the allocation size. */ WT_ERR(__wt_direct_io_size_check( session, filecfg, "allocation_size", &allocsize)); /* Create the file. */ WT_ERR(__wt_block_manager_create(session, filename, allocsize)); if (WT_META_TRACKING(session)) WT_ERR(__wt_meta_track_fileop(session, NULL, uri)); /* * If creating an ordinary file, append the file ID and current version * numbers to the passed-in configuration and insert the resulting * configuration into the metadata. */ if (!is_metadata) { WT_ERR(__wt_scr_alloc(session, 0, &val)); WT_ERR(__wt_buf_fmt(session, val, "id=%" PRIu32 ",version=(major=%d,minor=%d)", ++S2C(session)->next_file_id, WT_BTREE_MAJOR_VERSION_MAX, WT_BTREE_MINOR_VERSION_MAX)); for (p = filecfg; *p != NULL; ++p) ; *p = val->data; WT_ERR(__wt_config_collapse(session, filecfg, &fileconf)); WT_ERR(__wt_metadata_insert(session, uri, fileconf)); } /* * Open the file to check that it was setup correctly. We don't need to * pass the configuration, we just wrote the collapsed configuration * into the metadata file, and it's going to be read/used by underlying * functions. * * Keep the handle exclusive until it is released at the end of the * call, otherwise we could race with a drop. */ WT_ERR(__wt_session_get_btree( session, uri, NULL, NULL, WT_DHANDLE_EXCLUSIVE)); if (WT_META_TRACKING(session)) WT_ERR(__wt_meta_track_handle_lock(session, true)); else WT_ERR(__wt_session_release_btree(session)); err: __wt_scr_free(session, &val); __wt_free(session, fileconf); return (ret); }
/* * __wt_schema_open_colgroups -- * Open the column groups for a table. */ int __wt_schema_open_colgroups(WT_SESSION_IMPL *session, WT_TABLE *table) { WT_COLGROUP *colgroup; WT_CONFIG cparser; WT_CONFIG_ITEM ckey, cval; WT_DECL_RET; WT_DECL_ITEM(buf); char *cgconfig; u_int i; WT_ASSERT(session, F_ISSET(session, WT_SESSION_TABLE_LOCKED)); if (table->cg_complete) return (0); colgroup = NULL; cgconfig = NULL; WT_RET(__wt_scr_alloc(session, 0, &buf)); WT_ERR(__wt_config_subinit(session, &cparser, &table->cgconf)); /* Open each column group. */ for (i = 0; i < WT_COLGROUPS(table); i++) { if (table->ncolgroups > 0) WT_ERR(__wt_config_next(&cparser, &ckey, &cval)); else WT_CLEAR(ckey); /* * Always open from scratch: we may have failed part of the way * through opening a table, or column groups may have changed. */ if (table->cgroups[i] != NULL) { __wt_schema_destroy_colgroup( session, table->cgroups[i]); table->cgroups[i] = NULL; } WT_ERR(__wt_buf_init(session, buf, 0)); WT_ERR(__wt_schema_colgroup_name(session, table, ckey.str, ckey.len, buf)); if ((ret = __wt_metadata_search( session, buf->data, &cgconfig)) != 0) { /* It is okay if the table is incomplete. */ if (ret == WT_NOTFOUND) ret = 0; goto err; } WT_ERR(__wt_calloc_def(session, 1, &colgroup)); WT_ERR(__wt_strndup( session, buf->data, buf->size, &colgroup->name)); colgroup->config = cgconfig; cgconfig = NULL; WT_ERR(__wt_config_getones(session, colgroup->config, "columns", &colgroup->colconf)); WT_ERR(__wt_config_getones( session, colgroup->config, "source", &cval)); WT_ERR(__wt_buf_init(session, buf, 0)); WT_ERR(__wt_buf_fmt( session, buf, "%.*s", (int)cval.len, cval.str)); WT_ERR(__wt_strndup( session, buf->data, buf->size, &colgroup->source)); table->cgroups[i] = colgroup; colgroup = NULL; } if (!table->is_simple) { WT_ERR(__wt_table_check(session, table)); WT_ERR(__wt_buf_init(session, buf, 0)); WT_ERR(__wt_struct_plan(session, table, table->colconf.str, table->colconf.len, 1, buf)); WT_ERR(__wt_strndup( session, buf->data, buf->size, &table->plan)); } table->cg_complete = 1; err: __wt_scr_free(&buf); if (colgroup != NULL) __wt_schema_destroy_colgroup(session, colgroup); if (cgconfig != NULL) __wt_free(session, cgconfig); return (ret); }
/* * __wt_txn_recover -- * Run recovery. */ int __wt_txn_recover(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_CURSOR *metac; WT_DECL_RET; WT_RECOVERY r; struct WT_RECOVERY_FILE *metafile; char *config; bool eviction_started, needs_rec, was_backup; conn = S2C(session); WT_CLEAR(r); WT_INIT_LSN(&r.ckpt_lsn); eviction_started = false; was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP); /* We need a real session for recovery. */ WT_RET(__wt_open_internal_session(conn, "txn-recover", false, WT_SESSION_NO_LOGGING, &session)); r.session = session; WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config)); WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config)); WT_ERR(__wt_metadata_cursor_open(session, NULL, &metac)); metafile = &r.files[WT_METAFILE_ID]; metafile->c = metac; /* * If no log was found (including if logging is disabled), or if the * last checkpoint was done with logging disabled, recovery should not * run. Scan the metadata to figure out the largest file ID. */ if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_EXISTED) || WT_IS_MAX_LSN(&metafile->ckpt_lsn)) { WT_ERR(__recovery_file_scan(&r)); conn->next_file_id = r.max_fileid; goto done; } /* * First, do a pass through the log to recover the metadata, and * establish the last checkpoint LSN. Skip this when opening a hot * backup: we already have the correct metadata in that case. */ if (!was_backup) { r.metadata_only = true; /* * If this is a read-only connection, check if the checkpoint * LSN in the metadata file is up to date, indicating a clean * shutdown. */ if (F_ISSET(conn, WT_CONN_READONLY)) { WT_ERR(__wt_log_needs_recovery( session, &metafile->ckpt_lsn, &needs_rec)); if (needs_rec) WT_ERR_MSG(session, WT_RUN_RECOVERY, "Read-only database needs recovery"); } if (WT_IS_INIT_LSN(&metafile->ckpt_lsn)) WT_ERR(__wt_log_scan(session, NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r)); else { /* * Start at the last checkpoint LSN referenced in the * metadata. If we see the end of a checkpoint while * scanning, we will change the full scan to start from * there. */ r.ckpt_lsn = metafile->ckpt_lsn; ret = __wt_log_scan(session, &metafile->ckpt_lsn, 0, __txn_log_recover, &r); if (ret == ENOENT) ret = 0; WT_ERR(ret); } } /* Scan the metadata to find the live files and their IDs. */ WT_ERR(__recovery_file_scan(&r)); /* * We no longer need the metadata cursor: close it to avoid pinning any * resources that could block eviction during recovery. */ r.files[0].c = NULL; WT_ERR(metac->close(metac)); /* * Now, recover all the files apart from the metadata. * Pass WT_LOGSCAN_RECOVER so that old logs get truncated. */ r.metadata_only = false; WT_ERR(__wt_verbose(session, WT_VERB_RECOVERY, "Main recovery loop: starting at %u/%u", r.ckpt_lsn.l.file, r.ckpt_lsn.l.offset)); WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec)); /* * Check if the database was shut down cleanly. If not * return an error if the user does not want automatic * recovery. */ if (needs_rec && (FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR) || F_ISSET(conn, WT_CONN_READONLY))) { if (F_ISSET(conn, WT_CONN_READONLY)) WT_ERR_MSG(session, WT_RUN_RECOVERY, "Read-only database needs recovery"); WT_ERR(WT_RUN_RECOVERY); } if (F_ISSET(conn, WT_CONN_READONLY)) goto done; /* * Recovery can touch more data than fits in cache, so it relies on * regular eviction to manage paging. Start eviction threads for * recovery without LAS cursors. */ WT_ERR(__wt_evict_create(session)); eviction_started = true; /* * Always run recovery even if it was a clean shutdown only if * this is not a read-only connection. * We can consider skipping it in the future. */ if (WT_IS_INIT_LSN(&r.ckpt_lsn)) WT_ERR(__wt_log_scan(session, NULL, WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER, __txn_log_recover, &r)); else { ret = __wt_log_scan(session, &r.ckpt_lsn, WT_LOGSCAN_RECOVER, __txn_log_recover, &r); if (ret == ENOENT) ret = 0; WT_ERR(ret); } conn->next_file_id = r.max_fileid; /* * If recovery ran successfully forcibly log a checkpoint so the next * open is fast and keep the metadata up to date with the checkpoint * LSN and archiving. */ WT_ERR(session->iface.checkpoint(&session->iface, "force=1")); done: FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE); err: WT_TRET(__recovery_free(&r)); __wt_free(session, config); if (ret != 0) __wt_err(session, ret, "Recovery failed"); /* * Destroy the eviction threads that were started in support of * recovery. They will be restarted once the lookaside table is * created. */ if (eviction_started) WT_TRET(__wt_evict_destroy(session)); WT_TRET(session->iface.close(&session->iface, NULL)); return (ret); }
/* * __wt_lsm_tree_create -- * Create an LSM tree structure for the given name. */ int __wt_lsm_tree_create(WT_SESSION_IMPL *session, const char *uri, int exclusive, const char *config) { WT_CONFIG_ITEM cval; WT_DECL_ITEM(buf); WT_DECL_RET; WT_LSM_TREE *lsm_tree; const char *cfg[] = { WT_CONFIG_BASE(session, session_create), config, NULL }; const char *tmpconfig; /* If the tree is open, it already exists. */ if ((ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree)) == 0) { __wt_lsm_tree_release(session, lsm_tree); return (exclusive ? EEXIST : 0); } WT_RET_NOTFOUND_OK(ret); /* * If the tree has metadata, it already exists. * * !!! * Use a local variable: we don't care what the existing configuration * is, but we don't want to overwrite the real config. */ if (__wt_metadata_search(session, uri, &tmpconfig) == 0) { __wt_free(session, tmpconfig); return (exclusive ? EEXIST : 0); } WT_RET_NOTFOUND_OK(ret); WT_RET(__wt_config_gets(session, cfg, "key_format", &cval)); if (WT_STRING_MATCH("r", cval.str, cval.len)) WT_RET_MSG(session, EINVAL, "LSM trees cannot be configured as column stores"); WT_RET(__wt_calloc_def(session, 1, &lsm_tree)); WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri)); WT_ERR(__wt_config_gets(session, cfg, "key_format", &cval)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &lsm_tree->key_format)); WT_ERR(__wt_config_gets(session, cfg, "value_format", &cval)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &lsm_tree->value_format)); WT_ERR(__wt_config_gets(session, cfg, "collator", &cval)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &lsm_tree->collator_name)); WT_ERR(__wt_config_gets(session, cfg, "lsm.auto_throttle", &cval)); if (cval.val) F_SET(lsm_tree, WT_LSM_TREE_THROTTLE); else F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE); WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom", &cval)); FLD_SET(lsm_tree->bloom, (cval.val == 0 ? WT_LSM_BLOOM_OFF : WT_LSM_BLOOM_MERGED)); WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_oldest", &cval)); if (cval.val != 0) FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST); if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) && FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST)) WT_ERR_MSG(session, EINVAL, "Bloom filters can only be created on newest and oldest " "chunks if bloom filters are enabled"); WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_config", &cval)); if (cval.type == WT_CONFIG_ITEM_STRUCT) { cval.str++; cval.len -= 2; } WT_ERR(__wt_strndup(session, cval.str, cval.len, &lsm_tree->bloom_config)); WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_bit_count", &cval)); lsm_tree->bloom_bit_count = (uint32_t)cval.val; WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_hash_count", &cval)); lsm_tree->bloom_hash_count = (uint32_t)cval.val; WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_max", &cval)); lsm_tree->chunk_max = (uint64_t)cval.val; WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_size", &cval)); lsm_tree->chunk_size = (uint64_t)cval.val; if (lsm_tree->chunk_size > lsm_tree->chunk_max) WT_ERR_MSG(session, EINVAL, "Chunk size (chunk_size) must be smaller than or equal to " "the maximum chunk size (chunk_max)"); WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_max", &cval)); lsm_tree->merge_max = (uint32_t)cval.val; lsm_tree->merge_min = lsm_tree->merge_max / 2; WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_threads", &cval)); lsm_tree->merge_threads = (uint32_t)cval.val; /* Sanity check that api_data.py is in sync with lsm.h */ WT_ASSERT(session, lsm_tree->merge_threads <= WT_LSM_MAX_WORKERS); /* * Set up the config for each chunk. If possible, avoid high latencies * from fsync by flushing the cache every 8MB (will be overridden by * any application setting). */ tmpconfig = ""; #ifdef HAVE_SYNC_FILE_RANGE if (!S2C(session)->direct_io) tmpconfig = "os_cache_dirty_max=8MB,"; #endif WT_ERR(__wt_scr_alloc(session, 0, &buf)); WT_ERR(__wt_buf_fmt(session, buf, "%s%s,key_format=u,value_format=u", tmpconfig, config)); lsm_tree->file_config = __wt_buf_steal(session, buf, NULL); /* Create the first chunk and flush the metadata. */ WT_ERR(__wt_lsm_meta_write(session, lsm_tree)); /* Discard our partially populated handle. */ ret = __lsm_tree_discard(session, lsm_tree); lsm_tree = NULL; /* * Open our new tree and add it to the handle cache. Don't discard on * error: the returned handle is NULL on error, and the metadata * tracking macros handle cleaning up on failure. */ if (ret == 0) ret = __lsm_tree_open(session, uri, &lsm_tree); if (ret == 0) __wt_lsm_tree_release(session, lsm_tree); if (0) { err: WT_TRET(__lsm_tree_discard(session, lsm_tree)); } __wt_scr_free(&buf); return (ret); }
/* * __wt_curds_create -- * Initialize a data-source cursor. */ int __wt_curds_create(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, NULL, /* get-key */ NULL, /* get-value */ NULL, /* set-key */ NULL, /* set-value */ NULL, /* compare */ __curds_next, /* next */ __curds_prev, /* prev */ __curds_reset, /* reset */ __curds_search, /* search */ __curds_search_near, /* search-near */ __curds_insert, /* insert */ __curds_update, /* update */ __curds_remove, /* remove */ __curds_close); /* close */ WT_CONFIG_ITEM cval; WT_CURSOR *cursor, *dsc; WT_DECL_RET; const char *metaconf; metaconf = NULL; /* Open the WiredTiger cursor. */ WT_RET(__wt_calloc_def(session, 1, &cursor)); *cursor = iface; cursor->session = (WT_SESSION *)session; /* * XXX * We'll need the object's key and value formats. */ WT_ERR(__wt_metadata_search(session, uri, &metaconf)); WT_ERR(__wt_config_getones(session, metaconf, "key_format", &cval)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &cursor->key_format)); WT_ERR(__wt_config_getones(session, metaconf, "value_format", &cval)); WT_ERR( __wt_strndup(session, cval.str, cval.len, &cursor->value_format)); WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp)); WT_ERR(dsrc->open_cursor(dsrc, &session->iface, uri, (WT_CONFIG_ARG *)cfg, &dsc)); dsc->session = (WT_SESSION *)session; memset(&dsc->q, 0, sizeof(dsc->q)); dsc->recno = 0; memset(dsc->raw_recno_buf, 0, sizeof(dsc->raw_recno_buf)); memset(&dsc->key, 0, sizeof(dsc->key)); memset(&dsc->value, 0, sizeof(dsc->value)); memset(&dsc->saved_err, 0, sizeof(dsc->saved_err)); dsc->data_source = NULL; memset(&dsc->flags, 0, sizeof(dsc->flags)); /* Reference the underlying application cursor. */ cursor->data_source = dsc; if (0) { err: if (F_ISSET(cursor, WT_CURSTD_OPEN)) WT_TRET(cursor->close(cursor)); else __wt_free(session, cursor); } __wt_free(session, metaconf); return (ret); }