/*
 * __wt_curfile_create --
 *	Open a cursor for a given btree handle.
 */
int
__wt_curfile_create(WT_SESSION_IMPL *session,
    WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
{
	static WT_CURSOR iface = {
		NULL,
		NULL,
		NULL,
		NULL,
		NULL,
		NULL,
		NULL,
		NULL,
		__curfile_compare,
		__curfile_next,
		__curfile_prev,
		__curfile_reset,
		__curfile_search,
		__curfile_search_near,
		__curfile_insert,
		__curfile_update,
		__curfile_remove,
		__curfile_close,
		{ NULL, NULL },		/* TAILQ_ENTRY q */
		0,			/* recno key */
		{ 0 },			/* recno raw buffer */
		{ NULL, 0, 0, NULL, 0 },/* WT_ITEM key */
		{ NULL, 0, 0, NULL, 0 },/* WT_ITEM value */
		0,			/* int saved_err */
		0			/* uint32_t flags */
	};
	WT_BTREE *btree;
	WT_CONFIG_ITEM cval;
	WT_CURSOR *cursor;
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	size_t csize;
	int bitmap, bulk;

	cbt = NULL;

	btree = session->btree;
	WT_ASSERT(session, btree != NULL);

	WT_RET(__wt_config_gets_defno(session, cfg, "bulk", &cval));
	if ((cval.type == ITEM_ID || cval.type == ITEM_STRING) &&
	    WT_STRING_MATCH("bitmap", cval.str, cval.len))
		bitmap = bulk = 1;
	else {
		bitmap = 0;
		bulk = (cval.val != 0);
	}

	csize = bulk ? sizeof(WT_CURSOR_BULK) : sizeof(WT_CURSOR_BTREE);
	WT_RET(__wt_calloc(session, 1, csize, &cbt));

	cursor = &cbt->iface;
	*cursor = iface;
	cursor->session = &session->iface;
	cursor->uri = btree->name;
	cursor->key_format = btree->key_format;
	cursor->value_format = btree->value_format;

	cbt->btree = session->btree;
	if (bulk)
		WT_ERR(__wt_curbulk_init((WT_CURSOR_BULK *)cbt, bitmap));

	/*
	 * no_cache
	 * No cache cursors are read-only.
	 */
	WT_ERR(__wt_config_gets_defno(session, cfg, "no_cache", &cval));
	if (cval.val != 0) {
		cursor->insert = __wt_cursor_notsup;
		cursor->update = __wt_cursor_notsup;
		cursor->remove = __wt_cursor_notsup;
	}

	/*
	 * random_retrieval
	 * Random retrieval cursors only support next, reset and close.
	 */
	WT_ERR(__wt_config_gets_defno(session, cfg, "next_random", &cval));
	if (cval.val != 0) {
		__wt_cursor_set_notsup(cursor);
		cursor->next = __curfile_next_random;
		cursor->reset = __curfile_reset;
	}

	/* __wt_cursor_init is last so we don't have to clean up on error. */
	STATIC_ASSERT(offsetof(WT_CURSOR_BTREE, iface) == 0);
	WT_ERR(__wt_cursor_init(cursor, cursor->uri, owner, cfg, cursorp));

	if (0) {
err:		__wt_free(session, cbt);
	}
	return (ret);
}
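/*
 * Hypothetical usage sketch (not part of the original file): the bulk path
 * in __wt_curfile_create above is normally reached through the public
 * WT_SESSION API from wiredtiger.h.  The URI, key/value formats, and error
 * handling below are illustrative assumptions only.
 */
static int
example_bulk_load(WT_SESSION *wt_session)
{
	WT_CURSOR *bulk;
	int ret;

	/* Assumed schema: string keys and values. */
	if ((ret = wt_session->create(wt_session,
	    "table:example", "key_format=S,value_format=S")) != 0)
		return (ret);

	/* The "bulk" configuration selects the bulk-cursor path above. */
	if ((ret = wt_session->open_cursor(
	    wt_session, "table:example", NULL, "bulk", &bulk)) != 0)
		return (ret);

	/* Bulk cursors require keys to be inserted in sorted order. */
	bulk->set_key(bulk, "key1");
	bulk->set_value(bulk, "value1");
	if ((ret = bulk->insert(bulk)) != 0) {
		(void)bulk->close(bulk);
		return (ret);
	}
	return (bulk->close(bulk));
}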
/*
 * __lsm_bloom_create --
 *	Create a bloom filter for a chunk of the LSM tree that has been
 *	checkpointed but not yet merged.
 */
static int
__lsm_bloom_create(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk, u_int chunk_off)
{
	WT_BLOOM *bloom;
	WT_CURSOR *src;
	WT_DECL_RET;
	WT_ITEM key;
	uint64_t insert_count;

	WT_RET(__wt_lsm_tree_setup_bloom(session, lsm_tree, chunk));

	bloom = NULL;
	/*
	 * This is merge-like activity, and we don't want compacts to give up
	 * because we are creating a bunch of bloom filters before merging.
	 */
	++lsm_tree->merge_progressing;

	WT_RET(__wt_bloom_create(session, chunk->bloom_uri,
	    lsm_tree->bloom_config, chunk->count,
	    lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom));

	/* Open a special merge cursor just on this chunk. */
	WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
	F_SET(src, WT_CURSTD_RAW);
	WT_ERR(__wt_clsm_init_merge(src, chunk_off, chunk->id, 1));

	/*
	 * Set up so that we don't hold pages we read into cache, and so
	 * that we don't get stuck if the cache is full.  If we allow
	 * ourselves to get stuck creating bloom filters, the entire tree
	 * can stall since there may be no worker threads available to flush.
	 */
	F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
	for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
		WT_ERR(src->get_key(src, &key));
		__wt_bloom_insert(bloom, &key);
	}
	WT_ERR_NOTFOUND_OK(ret);
	WT_TRET(src->close(src));

	WT_TRET(__wt_bloom_finalize(bloom));
	WT_ERR(ret);

	F_CLR(session, WT_SESSION_NO_CACHE);

	/* Load the new Bloom filter into cache. */
	WT_CLEAR(key);
	WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key));

	__wt_verbose(session, WT_VERB_LSM,
	    "LSM worker created bloom filter %s. "
	    "Expected %" PRIu64 " items, got %" PRIu64,
	    chunk->bloom_uri, chunk->count, insert_count);

	/* Ensure the bloom filter is in the metadata. */
	__wt_lsm_tree_writelock(session, lsm_tree);
	F_SET(chunk, WT_LSM_CHUNK_BLOOM);
	ret = __wt_lsm_meta_write(session, lsm_tree);
	++lsm_tree->dsk_gen;
	__wt_lsm_tree_writeunlock(session, lsm_tree);

	if (ret != 0)
		WT_ERR_MSG(session, ret, "LSM bloom worker metadata write");

err:	if (bloom != NULL)
		WT_TRET(__wt_bloom_close(bloom));

	F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
	return (ret);
}
/*
 * __wt_lsm_get_chunk_to_flush --
 *	Find and pin a chunk in the LSM tree that is likely to need flushing.
 */
int
__wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, bool force, WT_LSM_CHUNK **chunkp)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk, *evict_chunk, *flush_chunk;
	uint32_t i;

	*chunkp = NULL;
	chunk = evict_chunk = flush_chunk = NULL;

	WT_ASSERT(session, lsm_tree->queue_ref > 0);
	__wt_lsm_tree_readlock(session, lsm_tree);
	if (!lsm_tree->active || lsm_tree->nchunks == 0) {
		__wt_lsm_tree_readunlock(session, lsm_tree);
		return (0);
	}

	/* Search for a chunk to evict and/or a chunk to flush. */
	for (i = 0; i < lsm_tree->nchunks; i++) {
		chunk = lsm_tree->chunk[i];
		if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
			/*
			 * Normally we don't want to force out the last chunk.
			 * But if we're doing a forced flush on behalf of a
			 * compact, then we want to include the final chunk.
			 */
			if (evict_chunk == NULL &&
			    !chunk->evicted &&
			    !F_ISSET(chunk, WT_LSM_CHUNK_STABLE))
				evict_chunk = chunk;
		} else if (flush_chunk == NULL &&
		    chunk->switch_txn != 0 &&
		    (force || i < lsm_tree->nchunks - 1))
			flush_chunk = chunk;
	}

	/*
	 * Don't be overly zealous about pushing old chunks from cache.
	 * Attempting too many drops can interfere with checkpoints.
	 *
	 * If retrying a discard, push an additional work unit so there are
	 * enough to trigger checkpoints.
	 */
	if (evict_chunk != NULL && flush_chunk != NULL) {
		chunk = (__wt_random(&session->rnd) & 1) ?
		    evict_chunk : flush_chunk;
		WT_ERR(__wt_lsm_manager_push_entry(
		    session, WT_LSM_WORK_FLUSH, 0, lsm_tree));
	} else
		chunk = (evict_chunk != NULL) ? evict_chunk : flush_chunk;

	if (chunk != NULL) {
		__wt_verbose(session, WT_VERB_LSM,
		    "Flush%s: return chunk %" PRIu32 " of %" PRIu32 ": %s",
		    force ? " w/ force" : "", i, lsm_tree->nchunks,
		    chunk->uri);
		(void)__wt_atomic_add32(&chunk->refcnt, 1);
	}

err:	__wt_lsm_tree_readunlock(session, lsm_tree);

	*chunkp = chunk;
	return (ret);
}
/*
 * __create_file --
 *	Create a new 'file:' object.
 */
static int
__create_file(WT_SESSION_IMPL *session,
    const char *uri, int exclusive, const char *config)
{
	WT_DECL_ITEM(val);
	WT_DECL_RET;
	uint32_t allocsize;
	int is_metadata;
	const char *filename, **p, *filecfg[] =
	    { WT_CONFIG_BASE(session, file_meta), config, NULL, NULL };
	char *fileconf;

	fileconf = NULL;

	is_metadata = strcmp(uri, WT_METAFILE_URI) == 0;

	filename = uri;
	if (!WT_PREFIX_SKIP(filename, "file:"))
		WT_RET_MSG(session, EINVAL, "Expected a 'file:' URI: %s", uri);

	/* Check if the file already exists. */
	if (!is_metadata && (ret =
	    __wt_metadata_search(session, uri, &fileconf)) != WT_NOTFOUND) {
		if (exclusive)
			WT_TRET(EEXIST);
		goto err;
	}

	/* Sanity check the allocation size. */
	WT_RET(__wt_direct_io_size_check(
	    session, filecfg, "allocation_size", &allocsize));

	/* Create the file. */
	WT_ERR(__wt_block_manager_create(session, filename, allocsize));
	if (WT_META_TRACKING(session))
		WT_ERR(__wt_meta_track_fileop(session, NULL, uri));

	/*
	 * If creating an ordinary file, append the file ID and current version
	 * numbers to the passed-in configuration and insert the resulting
	 * configuration into the metadata.
	 */
	if (!is_metadata) {
		WT_ERR(__wt_scr_alloc(session, 0, &val));
		WT_ERR(__wt_buf_fmt(session, val,
		    "id=%" PRIu32 ",version=(major=%d,minor=%d)",
		    ++S2C(session)->next_file_id,
		    WT_BTREE_MAJOR_VERSION_MAX, WT_BTREE_MINOR_VERSION_MAX));
		for (p = filecfg; *p != NULL; ++p)
			;
		*p = val->data;
		WT_ERR(__wt_config_collapse(session, filecfg, &fileconf));
		WT_ERR(__wt_metadata_insert(session, uri, fileconf));
	}

	/*
	 * Open the file to check that it was set up correctly.  We don't need
	 * to pass the configuration, we just wrote the collapsed configuration
	 * into the metadata file, and it's going to be read/used by underlying
	 * functions.
	 *
	 * Keep the handle exclusive until it is released at the end of the
	 * call, otherwise we could race with a drop.
	 */
	WT_ERR(__wt_session_get_btree(
	    session, uri, NULL, NULL, WT_DHANDLE_EXCLUSIVE));
	if (WT_META_TRACKING(session))
		WT_ERR(__wt_meta_track_handle_lock(session, 1));
	else
		WT_ERR(__wt_session_release_btree(session));

err:	__wt_scr_free(session, &val);
	__wt_free(session, fileconf);
	return (ret);
}
/*
 * __wt_lsm_work_bloom --
 *	Try to create a Bloom filter for the newest on-disk chunk that doesn't
 *	have one.
 */
int
__wt_lsm_work_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_WORKER_COOKIE cookie;
	u_int i, merge;

	WT_CLEAR(cookie);

	WT_RET(__lsm_copy_chunks(session, lsm_tree, &cookie, false));

	/* Create bloom filters in all checkpointed chunks. */
	merge = 0;
	for (i = 0; i < cookie.nchunks; i++) {
		chunk = cookie.chunk_array[i];

		/*
		 * Skip if a thread is still active in the chunk or it
		 * isn't suitable.
		 */
		if (!F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) ||
		    F_ISSET(chunk,
			WT_LSM_CHUNK_BLOOM | WT_LSM_CHUNK_MERGING) ||
		    chunk->generation > 0 || chunk->count == 0)
			continue;

		/* Never create a bloom filter on the oldest chunk. */
		if (chunk == lsm_tree->chunk[0] &&
		    !FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST))
			continue;

		/*
		 * See if we win the race to switch on the "busy" flag and
		 * recheck that the chunk still needs a Bloom filter.
		 */
		if (__wt_atomic_cas32(&chunk->bloom_busy, 0, 1)) {
			if (!F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) {
				ret = __lsm_bloom_create(
				    session, lsm_tree, chunk, (u_int)i);
				/*
				 * Record if we were successful so that we can
				 * later push a merge work unit.
				 */
				if (ret == 0)
					merge = 1;
			}
			chunk->bloom_busy = 0;
			break;
		}
	}

	/*
	 * If we created any bloom filters, we push a merge work unit now.
	 */
	if (merge)
		WT_ERR(__wt_lsm_manager_push_entry(
		    session, WT_LSM_WORK_MERGE, 0, lsm_tree));

err:	__lsm_unpin_chunks(session, &cookie);
	__wt_free(session, cookie.chunk_array);
	return (ret);
}
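/*
 * Hypothetical configuration sketch (not part of the original file): the
 * Bloom parameters consumed above (bloom_bit_count, bloom_hash_count, and
 * the oldest-chunk policy) are normally fixed when the LSM tree is created.
 * The URI and the specific values are illustrative assumptions, not tuning
 * advice.
 */
static int
example_create_lsm_with_bloom(WT_SESSION *wt_session)
{
	return (wt_session->create(wt_session, "lsm:example",
	    "key_format=S,value_format=S,"
	    "lsm=(bloom=true,bloom_bit_count=16,bloom_hash_count=8,"
	    "bloom_oldest=false)"));
}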
/*
 * __open_verbose --
 *	Optionally output a verbose message on handle open.
 */
static inline int
__open_verbose(WT_SESSION_IMPL *session,
    const char *name, uint32_t file_type, uint32_t flags)
{
#ifdef HAVE_VERBOSE
	WT_DECL_RET;
	WT_DECL_ITEM(tmp);
	const char *file_type_tag, *sep;

	if (!WT_VERBOSE_ISSET(session, WT_VERB_FILEOPS))
		return (0);

	/*
	 * It's useful to track file opens when debugging platforms, take some
	 * effort to output good tracking information.
	 */
	switch (file_type) {
	case WT_FILE_TYPE_CHECKPOINT:
		file_type_tag = "checkpoint";
		break;
	case WT_FILE_TYPE_DATA:
		file_type_tag = "data";
		break;
	case WT_FILE_TYPE_DIRECTORY:
		file_type_tag = "directory";
		break;
	case WT_FILE_TYPE_LOG:
		file_type_tag = "log";
		break;
	case WT_FILE_TYPE_REGULAR:
		file_type_tag = "regular";
		break;
	default:
		file_type_tag = "unknown open type";
		break;
	}

	WT_RET(__wt_scr_alloc(session, 0, &tmp));
	sep = " (";
#define	WT_OPEN_VERBOSE_FLAG(f, name)					\
	if (LF_ISSET(f)) {						\
		WT_ERR(__wt_buf_catfmt(					\
		    session, tmp, "%s%s", sep, name));			\
		sep = ", ";						\
	}

	WT_OPEN_VERBOSE_FLAG(WT_OPEN_CREATE, "create");
	WT_OPEN_VERBOSE_FLAG(WT_OPEN_EXCLUSIVE, "exclusive");
	WT_OPEN_VERBOSE_FLAG(WT_OPEN_FIXED, "fixed");
	WT_OPEN_VERBOSE_FLAG(WT_OPEN_READONLY, "readonly");
	WT_OPEN_VERBOSE_FLAG(WT_STREAM_APPEND, "stream-append");
	WT_OPEN_VERBOSE_FLAG(WT_STREAM_READ, "stream-read");
	WT_OPEN_VERBOSE_FLAG(WT_STREAM_WRITE, "stream-write");

	if (tmp->size != 0)
		WT_ERR(__wt_buf_catfmt(session, tmp, ")"));

	ret = __wt_verbose(session, WT_VERB_FILEOPS,
	    "%s: handle-open: type %s%s",
	    name, file_type_tag, tmp->size == 0 ? "" : (char *)tmp->data);

err:	__wt_scr_free(session, &tmp);
	return (ret);
#else
	WT_UNUSED(session);
	WT_UNUSED(name);
	WT_UNUSED(file_type);
	WT_UNUSED(flags);
	return (0);
#endif
}
/*
 * __create_index --
 *	Create an index.
 */
static int
__create_index(WT_SESSION_IMPL *session,
    const char *name, int exclusive, const char *config)
{
	WT_CONFIG kcols, pkcols;
	WT_CONFIG_ITEM ckey, cval, icols, kval;
	WT_DECL_PACK_VALUE(pv);
	WT_DECL_RET;
	WT_ITEM confbuf, extra_cols, fmt, namebuf;
	WT_PACK pack;
	WT_TABLE *table;
	const char *cfg[4] =
	    { WT_CONFIG_BASE(session, index_meta), NULL, NULL, NULL };
	const char *sourcecfg[] = { config, NULL, NULL };
	const char *source, *sourceconf, *idxname, *tablename;
	char *idxconf;
	size_t tlen;
	int have_extractor;
	u_int i, npublic_cols;

	sourceconf = NULL;
	idxconf = NULL;
	WT_CLEAR(confbuf);
	WT_CLEAR(fmt);
	WT_CLEAR(extra_cols);
	WT_CLEAR(namebuf);
	have_extractor = 0;

	tablename = name;
	if (!WT_PREFIX_SKIP(tablename, "index:"))
		return (EINVAL);
	idxname = strchr(tablename, ':');
	if (idxname == NULL)
		WT_RET_MSG(session, EINVAL, "Invalid index name, "
		    "should be <table name>:<index name>: %s", name);

	tlen = (size_t)(idxname++ - tablename);
	if ((ret =
	    __wt_schema_get_table(session, tablename, tlen, 1, &table)) != 0)
		WT_RET_MSG(session, ret,
		    "Can't create an index for a non-existent table: %.*s",
		    (int)tlen, tablename);

	if (table->is_simple)
		WT_RET_MSG(session, EINVAL,
		    "%s requires a table with named columns", name);

	if (__wt_config_getones(session, config, "source", &cval) == 0) {
		WT_ERR(__wt_buf_fmt(session, &namebuf,
		    "%.*s", (int)cval.len, cval.str));
		source = namebuf.data;
	} else {
		WT_ERR(__wt_schema_index_source(
		    session, table, idxname, config, &namebuf));
		source = namebuf.data;

		/* Add the source name to the index config before collapsing. */
		WT_ERR(__wt_buf_catfmt(session, &confbuf,
		    ",source=\"%s\"", source));
	}

	if (__wt_config_getones_none(
	    session, config, "extractor", &cval) == 0 && cval.len != 0) {
		have_extractor = 1;
		/* Custom extractors must supply a key format. */
		if ((ret = __wt_config_getones(
		    session, config, "key_format", &kval)) != 0)
			WT_ERR_MSG(session, EINVAL,
			    "%s: custom extractors require a key_format",
			    name);
	}

	/* Calculate the key/value formats. */
	WT_CLEAR(icols);
	if (__wt_config_getones(session, config, "columns", &icols) != 0 &&
	    !have_extractor)
		WT_ERR_MSG(session, EINVAL,
		    "%s: requires 'columns' configuration", name);

	/*
	 * Count the public columns using the declared columns for normal
	 * indices or the key format for custom extractors.
	 */
	npublic_cols = 0;
	if (!have_extractor) {
		WT_ERR(__wt_config_subinit(session, &kcols, &icols));
		while ((ret = __wt_config_next(&kcols, &ckey, &cval)) == 0)
			++npublic_cols;
		WT_ERR_NOTFOUND_OK(ret);
	} else {
		WT_ERR(__pack_initn(session, &pack, kval.str, kval.len));
		while ((ret = __pack_next(&pack, &pv)) == 0)
			++npublic_cols;
		WT_ERR_NOTFOUND_OK(ret);
	}

	/*
	 * The key format for an index is somewhat subtle: the application
	 * specifies a set of columns that it will use for the key, but the
	 * engine usually adds some hidden columns in order to derive the
	 * primary key.  These hidden columns are part of the source's
	 * key_format, which we are calculating now, but not part of an index
	 * cursor's key_format.
	 */
	WT_ERR(__wt_config_subinit(session, &pkcols, &table->colconf));
	for (i = 0; i < table->nkey_columns &&
	    (ret = __wt_config_next(&pkcols, &ckey, &cval)) == 0;
	    i++) {
		/*
		 * If the primary key column is already in the secondary key,
		 * don't add it again.
		 */
		if (__wt_config_subgetraw(
		    session, &icols, &ckey, &cval) == 0) {
			if (have_extractor)
				WT_ERR_MSG(session, EINVAL,
				    "an index with a custom extractor may not "
				    "include primary key columns");
			continue;
		}
		WT_ERR(__wt_buf_catfmt(
		    session, &extra_cols, "%.*s,", (int)ckey.len, ckey.str));
	}
	if (ret != 0 && ret != WT_NOTFOUND)
		goto err;

	/* Index values are empty: all columns are packed into the index key. */
	WT_ERR(__wt_buf_fmt(session, &fmt, "value_format=,key_format="));

	if (have_extractor) {
		WT_ERR(__wt_buf_catfmt(session, &fmt, "%.*s",
		    (int)kval.len, kval.str));
		WT_CLEAR(icols);
	}

	/*
	 * Construct the index key format, or append the primary key columns
	 * for custom extractors.
	 */
	WT_ERR(__wt_struct_reformat(session, table, icols.str, icols.len,
	    (const char *)extra_cols.data, 0, &fmt));

	/* Check for a record number index key, which makes no sense. */
	WT_ERR(__wt_config_getones(session, fmt.data, "key_format", &cval));
	if (cval.len == 1 && cval.str[0] == 'r')
		WT_ERR_MSG(session, EINVAL,
		    "column-store index may not use the record number as its "
		    "index key");

	WT_ERR(__wt_buf_catfmt(
	    session, &fmt, ",index_key_columns=%u", npublic_cols));

	sourcecfg[1] = fmt.data;
	WT_ERR(__wt_config_merge(session, sourcecfg, NULL, &sourceconf));

	WT_ERR(__wt_schema_create(session, source, sourceconf));

	cfg[1] = sourceconf;
	cfg[2] = confbuf.data;
	WT_ERR(__wt_config_collapse(session, cfg, &idxconf));

	if ((ret = __wt_metadata_insert(session, name, idxconf)) != 0) {
		/*
		 * If the entry already exists in the metadata, we're done.
		 * This is an error for exclusive creates but okay otherwise.
		 */
		if (ret == WT_DUPLICATE_KEY)
			ret = exclusive ? EEXIST : 0;
		goto err;
	}

	/* Make sure that the configuration is valid. */
	WT_ERR(__wt_schema_open_index(
	    session, table, idxname, strlen(idxname), NULL));

err:	__wt_free(session, idxconf);
	__wt_free(session, sourceconf);
	__wt_buf_free(session, &confbuf);
	__wt_buf_free(session, &extra_cols);
	__wt_buf_free(session, &fmt);
	__wt_buf_free(session, &namebuf);

	__wt_schema_release_table(session, table);
	return (ret);
}
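/*
 * Hypothetical usage sketch (not part of the original file): __create_index
 * above is reached through WT_SESSION::create with an "index:" URI.  The
 * table must have named columns, as the code enforces.  Table, column, and
 * index names are illustrative assumptions.
 */
static int
example_create_index(WT_SESSION *wt_session)
{
	int ret;

	if ((ret = wt_session->create(wt_session, "table:people",
	    "key_format=S,value_format=SS,columns=(id,name,country)")) != 0)
		return (ret);

	/* Index the "country" column. */
	return (wt_session->create(
	    wt_session, "index:people:country", "columns=(country)"));
}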
/*
 * __lsm_bloom_create --
 *	Create a bloom filter for a chunk of the LSM tree that has been
 *	checkpointed but not yet merged.
 */
static int
__lsm_bloom_create(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk, u_int chunk_off)
{
	WT_BLOOM *bloom;
	WT_CURSOR *src;
	WT_DECL_RET;
	WT_ITEM buf, key;
	WT_SESSION *wt_session;
	uint64_t insert_count;
	int exist;

	/*
	 * Normally, the Bloom URI is populated when the chunk struct is
	 * allocated.  After an open, however, it may not have been.
	 * Deal with that here.
	 */
	if (chunk->bloom_uri == NULL) {
		WT_CLEAR(buf);
		WT_RET(__wt_lsm_tree_bloom_name(
		    session, lsm_tree, chunk->id, &buf));
		chunk->bloom_uri = __wt_buf_steal(session, &buf, NULL);
	}

	/*
	 * Drop the bloom filter first - there may be some content hanging over
	 * from an aborted merge or checkpoint.
	 */
	wt_session = &session->iface;
	WT_RET(__wt_exist(
	    session, chunk->bloom_uri + strlen("file:"), &exist));
	if (exist)
		WT_RET(wt_session->drop(
		    wt_session, chunk->bloom_uri, "force"));

	bloom = NULL;
	/*
	 * This is merge-like activity, and we don't want compacts to give up
	 * because we are creating a bunch of bloom filters before merging.
	 */
	++lsm_tree->merge_progressing;

	WT_RET(__wt_bloom_create(session, chunk->bloom_uri,
	    lsm_tree->bloom_config, chunk->count,
	    lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom));

	/* Open a special merge cursor just on this chunk. */
	WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
	F_SET(src, WT_CURSTD_RAW);
	WT_ERR(__wt_clsm_init_merge(src, chunk_off, chunk->id, 1));

	F_SET(session, WT_SESSION_NO_CACHE);
	for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
		WT_ERR(src->get_key(src, &key));
		WT_ERR(__wt_bloom_insert(bloom, &key));
	}
	WT_ERR_NOTFOUND_OK(ret);
	WT_TRET(src->close(src));

	WT_TRET(__wt_bloom_finalize(bloom));
	WT_ERR(ret);

	F_CLR(session, WT_SESSION_NO_CACHE);

	/* Load the new Bloom filter into cache. */
	WT_CLEAR(key);
	WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key));

	WT_VERBOSE_ERR(session, lsm,
	    "LSM worker created bloom filter %s. "
	    "Expected %" PRIu64 " items, got %" PRIu64,
	    chunk->bloom_uri, chunk->count, insert_count);

	/* Ensure the bloom filter is in the metadata. */
	WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));
	F_SET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM);
	ret = __wt_lsm_meta_write(session, lsm_tree);
	++lsm_tree->dsk_gen;
	WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));

	if (ret != 0)
		WT_ERR_MSG(session, ret, "LSM bloom worker metadata write");

err:	if (bloom != NULL)
		WT_TRET(__wt_bloom_close(bloom));

	F_CLR(session, WT_SESSION_NO_CACHE);
	return (ret);
}
/*
 * __lsm_free_chunks --
 *	Try to drop chunks from the tree that are no longer required.
 */
static int
__lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_WORKER_COOKIE cookie;
	u_int i, skipped;
	int progress;

	/*
	 * Take a copy of the current state of the LSM tree and look for chunks
	 * to drop.  We do it this way to avoid holding the LSM tree lock while
	 * doing I/O or waiting on the schema lock.
	 *
	 * This is safe because only one thread will be in this function at a
	 * time (the first merge thread).  Merges may complete concurrently,
	 * and the old_chunks array may be extended, but we shuffle down the
	 * pointers each time we free one to keep the non-NULL slots at the
	 * beginning of the array.
	 */
	WT_CLEAR(cookie);
	WT_RET(__lsm_copy_chunks(session, lsm_tree, &cookie, 1));
	for (i = skipped = 0, progress = 0; i < cookie.nchunks; i++) {
		chunk = cookie.chunk_array[i];
		WT_ASSERT(session, chunk != NULL);
		/* Skip the chunk if another worker is using it. */
		if (chunk->refcnt > 1) {
			++skipped;
			continue;
		}

		if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM)) {
			/*
			 * An EBUSY return is acceptable - a cursor may still
			 * be positioned on this old chunk.
			 */
			if ((ret = __lsm_drop_file(
			    session, chunk->bloom_uri)) == EBUSY) {
				WT_VERBOSE_ERR(session, lsm,
				    "LSM worker bloom drop busy: %s.",
				    chunk->bloom_uri);
				++skipped;
				continue;
			} else
				WT_ERR(ret);

			F_CLR_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM);
		}

		if (chunk->uri != NULL) {
			/*
			 * An EBUSY return is acceptable - a cursor may still
			 * be positioned on this old chunk.
			 */
			if ((ret = __lsm_drop_file(
			    session, chunk->uri)) == EBUSY) {
				WT_VERBOSE_ERR(session, lsm,
				    "LSM worker drop busy: %s.",
				    chunk->uri);
				++skipped;
				continue;
			} else
				WT_ERR(ret);
		}

		progress = 1;

		/* Lock the tree to clear out the old chunk information. */
		WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));

		/*
		 * The chunk we are looking at should be the first one in the
		 * tree that we haven't already skipped over.
		 */
		WT_ASSERT(session, lsm_tree->old_chunks[skipped] == chunk);
		__wt_free(session, chunk->bloom_uri);
		__wt_free(session, chunk->uri);
		__wt_free(session, lsm_tree->old_chunks[skipped]);

		/* Shuffle down to keep all occupied slots at the beginning. */
		if (--lsm_tree->nold_chunks > skipped) {
			memmove(lsm_tree->old_chunks + skipped,
			    lsm_tree->old_chunks + skipped + 1,
			    (lsm_tree->nold_chunks - skipped) *
			    sizeof(WT_LSM_CHUNK *));
			lsm_tree->old_chunks[lsm_tree->nold_chunks] = NULL;
		}
		/*
		 * Clear the chunk in the cookie so we don't attempt to
		 * decrement the reference count.
		 */
		cookie.chunk_array[i] = NULL;

		/*
		 * Update the metadata.  We used to try to optimize by only
		 * updating the metadata once at the end, but the error
		 * handling is not straightforward.
		 */
		WT_TRET(__wt_lsm_meta_write(session, lsm_tree));
		WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
	}

err:	__lsm_unpin_chunks(session, &cookie);
	__wt_free(session, cookie.chunk_array);

	/* Returning non-zero means there is no work to do. */
	if (!progress)
		WT_TRET(WT_NOTFOUND);

	return (ret);
}
/*
 * __wt_btcur_prev --
 *	Move to the previous record in the tree.
 */
int
__wt_btcur_prev(WT_CURSOR_BTREE *cbt, int truncating)
{
	WT_DECL_RET;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	uint32_t flags;
	int skipped, newpage;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	WT_STAT_FAST_CONN_INCR(session, cursor_prev);
	WT_STAT_FAST_DATA_INCR(session, cursor_prev);

	flags = WT_READ_PREV | WT_READ_SKIP_INTL;	/* Tree walk flags. */
	if (truncating)
		LF_SET(WT_READ_TRUNCATE);

	WT_RET(__cursor_func_init(cbt, 0));

	/*
	 * If we aren't already iterating in the right direction, there's
	 * some setup to do.
	 */
	if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV))
		__wt_btcur_iterate_setup(cbt, 0);

	/*
	 * Walk any page we're holding until the underlying call returns
	 * not-found.  Then, move to the previous page, until we reach the
	 * start of the file.
	 */
	for (skipped = newpage = 0;; skipped = 0, newpage = 1) {
		page = cbt->ref == NULL ? NULL : cbt->ref->page;
		WT_ASSERT(session,
		    page == NULL || !WT_PAGE_IS_INTERNAL(page));

		/*
		 * The last page in a column-store has appended entries.
		 * We handle it separately from the usual cursor code:
		 * it's only that one page and it's in a simple format.
		 */
		if (newpage && page != NULL &&
		    page->type != WT_PAGE_ROW_LEAF &&
		    (cbt->ins_head = WT_COL_APPEND(page)) != NULL)
			F_SET(cbt, WT_CBT_ITERATE_APPEND);

		if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
			switch (page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_append_prev(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_append_prev(
				    cbt, newpage, &skipped);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			if (ret == 0)
				break;
			F_CLR(cbt, WT_CBT_ITERATE_APPEND);
			if (ret != WT_NOTFOUND)
				break;
			newpage = 1;
		}
		if (page != NULL) {
			switch (page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_prev(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_prev(
				    cbt, newpage, &skipped);
				break;
			case WT_PAGE_ROW_LEAF:
				ret = __cursor_row_prev(
				    cbt, newpage, &skipped);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			if (ret != WT_NOTFOUND)
				break;
		}

		if (newpage && skipped)
			page->read_gen = WT_READGEN_OLDEST;

		WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
		WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
	}

err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}
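/*
 * Hypothetical usage sketch (not part of the original file): __wt_btcur_prev
 * implements WT_CURSOR::prev, so a reverse scan from the application looks
 * like this.  The URI and string formats are illustrative assumptions.
 */
static int
example_reverse_scan(WT_SESSION *wt_session)
{
	WT_CURSOR *cursor;
	const char *key, *value;
	int ret;

	if ((ret = wt_session->open_cursor(
	    wt_session, "table:example", NULL, NULL, &cursor)) != 0)
		return (ret);

	/* An unpositioned cursor starts the walk from the last record. */
	while ((ret = cursor->prev(cursor)) == 0) {
		if ((ret = cursor->get_key(cursor, &key)) != 0 ||
		    (ret = cursor->get_value(cursor, &value)) != 0)
			break;
		/* Consume the key/value pair here. */
	}
	if (ret == WT_NOTFOUND)		/* Walked off the start of the tree. */
		ret = 0;

	if (cursor->close(cursor) != 0 && ret == 0)
		ret = WT_ERROR;
	return (ret);
}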
/*
 * __wt_lsm_checkpoint_worker --
 *	A worker thread for an LSM tree, responsible for flushing new chunks
 *	to disk.
 */
void *
__wt_lsm_checkpoint_worker(void *arg)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	WT_LSM_WORKER_COOKIE cookie;
	WT_SESSION_IMPL *session;
	WT_TXN_ISOLATION saved_isolation;
	u_int i, j;
	int locked;

	lsm_tree = arg;
	session = lsm_tree->ckpt_session;

	WT_CLEAR(cookie);

	while (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) {
		if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
			WT_WITH_SCHEMA_LOCK(session,
			    ret = __wt_lsm_tree_switch(session, lsm_tree));
			WT_ERR(ret);
		}

		WT_ERR(__lsm_copy_chunks(session, lsm_tree, &cookie, 0));

		/* Write checkpoints in all completed files. */
		for (i = 0, j = 0; i < cookie.nchunks - 1; i++) {
			if (!F_ISSET(lsm_tree, WT_LSM_TREE_WORKING))
				goto err;

			if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
				break;

			chunk = cookie.chunk_array[i];

			/* Stop if a running transaction needs the chunk. */
			__wt_txn_update_oldest(session);
			if (!__wt_txn_visible_all(session, chunk->txnid_max))
				break;

			/*
			 * If the chunk is already checkpointed, make sure it
			 * is also evicted.  Either way, there is no point
			 * trying to checkpoint it again.
			 */
			if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK)) {
				if (F_ISSET_ATOMIC(
				    chunk, WT_LSM_CHUNK_EVICTED))
					continue;

				if ((ret = __lsm_discard_handle(
				    session, chunk->uri, NULL)) == 0)
					F_SET_ATOMIC(
					    chunk, WT_LSM_CHUNK_EVICTED);
				else if (ret == EBUSY)
					ret = 0;
				else
					WT_ERR_MSG(session, ret,
					    "discard handle");
				continue;
			}

			WT_VERBOSE_ERR(session, lsm,
			    "LSM worker flushing %u", i);

			/*
			 * Flush the file before checkpointing: this is the
			 * expensive part in terms of I/O: do it without
			 * holding the schema lock.
			 *
			 * Use the special eviction isolation level to avoid
			 * interfering with an application checkpoint: we have
			 * already checked that all of the updates in this
			 * chunk are globally visible.
			 *
			 * !!! We can wait here for checkpoints and fsyncs to
			 * complete, which can be a long time.
			 *
			 * Don't keep waiting for the lock if application
			 * threads are waiting for a switch.  Don't skip
			 * flushing the leaves either: that just means we'll
			 * hold the schema lock for (much) longer, which blocks
			 * the world.
			 */
			WT_ERR(__wt_session_get_btree(
			    session, chunk->uri, NULL, NULL, 0));
			for (locked = 0; !locked && ret == 0 &&
			    !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);) {
				if ((ret = __wt_spin_trylock(session,
				    &S2C(session)->checkpoint_lock)) == 0)
					locked = 1;
				else if (ret == EBUSY) {
					__wt_yield();
					ret = 0;
				}
			}
			if (locked) {
				saved_isolation = session->txn.isolation;
				session->txn.isolation = TXN_ISO_EVICTION;
				ret = __wt_bt_cache_op(
				    session, NULL, WT_SYNC_WRITE_LEAVES);
				session->txn.isolation = saved_isolation;
				__wt_spin_unlock(
				    session, &S2C(session)->checkpoint_lock);
			}
			WT_TRET(__wt_session_release_btree(session));
			WT_ERR(ret);

			if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
				break;

			WT_VERBOSE_ERR(session, lsm,
			    "LSM worker checkpointing %u", i);

			WT_WITH_SCHEMA_LOCK(session,
			    ret = __wt_schema_worker(session, chunk->uri,
			    __wt_checkpoint, NULL, NULL, 0));

			if (ret != 0) {
				__wt_err(session, ret, "LSM checkpoint");
				break;
			}

			WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));
			/*
			 * Clear the "cache resident" flag so the primary can
			 * be evicted and eventually closed.  Only do this once
			 * the checkpoint has succeeded: otherwise, accessing
			 * the leaf page during the checkpoint can trigger
			 * forced eviction.
			 */
			WT_ERR(__wt_session_get_btree(
			    session, chunk->uri, NULL, NULL, 0));
			__wt_btree_evictable(session, 1);
			WT_ERR(__wt_session_release_btree(session));

			++j;
			WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));
			F_SET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK);
			ret = __wt_lsm_meta_write(session, lsm_tree);
			++lsm_tree->dsk_gen;

			/* Update the throttle time. */
			__wt_lsm_tree_throttle(session, lsm_tree);
			WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));

			/* Make sure we aren't pinning a transaction ID. */
			__wt_txn_release_snapshot(session);

			if (ret != 0) {
				__wt_err(session, ret,
				    "LSM checkpoint metadata write");
				break;
			}

			WT_VERBOSE_ERR(session, lsm,
			    "LSM worker checkpointed %u", i);
		}
		__lsm_unpin_chunks(session, &cookie);
		if (j == 0 && F_ISSET(lsm_tree, WT_LSM_TREE_WORKING) &&
		    !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
			WT_ERR_TIMEDOUT_OK(__wt_cond_wait(
			    session, lsm_tree->work_cond, 100000));
	}
err:	__lsm_unpin_chunks(session, &cookie);
	__wt_free(session, cookie.chunk_array);
	/*
	 * The thread will only exit with failure if we run out of memory or
	 * there is some other system driven failure.  We can't keep going
	 * after such a failure - ensure WiredTiger shuts down.
	 */
	if (ret != 0 && ret != WT_NOTFOUND)
		WT_PANIC_ERR(session, ret,
		    "Shutting down LSM checkpoint utility thread");
	return (NULL);
}
/*
 * __wt_txn_recover --
 *	Run recovery.
 */
int
__wt_txn_recover(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR *metac;
	WT_DECL_RET;
	WT_RECOVERY r;
	struct WT_RECOVERY_FILE *metafile;
	char *config;
	int needs_rec, was_backup;

	conn = S2C(session);
	WT_CLEAR(r);
	WT_INIT_LSN(&r.ckpt_lsn);
	was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP) ? 1 : 0;

	/* We need a real session for recovery. */
	WT_RET(__wt_open_session(conn, NULL, NULL, &session));
	F_SET(session, WT_SESSION_NO_LOGGING);
	r.session = session;

	WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config));
	WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config));
	WT_ERR(__wt_metadata_cursor(session, NULL, &metac));
	metafile = &r.files[WT_METAFILE_ID];
	metafile->c = metac;

	/*
	 * If no log was found (including if logging is disabled), or if the
	 * last checkpoint was done with logging disabled, recovery should not
	 * run.  Scan the metadata to figure out the largest file ID.
	 */
	if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_EXISTED) ||
	    WT_IS_MAX_LSN(&metafile->ckpt_lsn)) {
		WT_ERR(__recovery_file_scan(&r));
		conn->next_file_id = r.max_fileid;
		goto done;
	}

	/*
	 * First, do a pass through the log to recover the metadata, and
	 * establish the last checkpoint LSN.  Skip this when opening a hot
	 * backup: we already have the correct metadata in that case.
	 */
	if (!was_backup) {
		r.metadata_only = 1;
		if (WT_IS_INIT_LSN(&metafile->ckpt_lsn))
			WT_ERR(__wt_log_scan(session,
			    NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r));
		else {
			/*
			 * Start at the last checkpoint LSN referenced in the
			 * metadata.  If we see the end of a checkpoint while
			 * scanning, we will change the full scan to start from
			 * there.
			 */
			r.ckpt_lsn = metafile->ckpt_lsn;
			WT_ERR(__wt_log_scan(session,
			    &metafile->ckpt_lsn, 0, __txn_log_recover, &r));
		}
	}

	/* Scan the metadata to find the live files and their IDs. */
	WT_ERR(__recovery_file_scan(&r));

	/*
	 * We no longer need the metadata cursor: close it to avoid pinning any
	 * resources that could block eviction during recovery.
	 */
	r.files[0].c = NULL;
	WT_ERR(metac->close(metac));

	/*
	 * Now, recover all the files apart from the metadata.
	 * Pass WT_LOGSCAN_RECOVER so that old logs get truncated.
	 */
	r.metadata_only = 0;
	WT_ERR(__wt_verbose(session, WT_VERB_RECOVERY,
	    "Main recovery loop: starting at %u/%" PRIuMAX,
	    r.ckpt_lsn.file, (uintmax_t)r.ckpt_lsn.offset));
	WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec));
	/*
	 * Check if the database was shut down cleanly.  If not, return an
	 * error if the user does not want automatic recovery.
	 */
	if (needs_rec && FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR))
		WT_ERR(WT_RUN_RECOVERY);

	/*
	 * Always run recovery even if it was a clean shutdown.
	 * We can consider skipping it in the future.
	 */
	if (WT_IS_INIT_LSN(&r.ckpt_lsn))
		WT_ERR(__wt_log_scan(session, NULL,
		    WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER,
		    __txn_log_recover, &r));
	else
		WT_ERR(__wt_log_scan(session, &r.ckpt_lsn,
		    WT_LOGSCAN_RECOVER, __txn_log_recover, &r));

	conn->next_file_id = r.max_fileid;

	/*
	 * If recovery ran successfully, forcibly log a checkpoint so the next
	 * open is fast and keep the metadata up to date with the checkpoint
	 * LSN and archiving.
	 */
	WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));

done:
err:	WT_TRET(__recovery_free(&r));
	__wt_free(session, config);
	WT_TRET(session->iface.close(&session->iface, NULL));

	return (ret);
}
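/*
 * Hypothetical usage sketch (not part of the original file): recovery runs
 * inside wiredtiger_open.  With logging enabled, "recover=on" permits the
 * log replay above; "recover=error" instead fails the open when recovery is
 * needed, which corresponds to the WT_CONN_LOG_RECOVER_ERR check.  The home
 * directory path is an illustrative assumption.
 */
static int
example_open_with_recovery(WT_CONNECTION **connp)
{
	return (wiredtiger_open("/path/to/database/home",
	    NULL, "log=(enabled,recover=on)", connp));
}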
/*
 * __txn_op_apply --
 *	Apply a transactional operation during recovery.
 */
static int
__txn_op_apply(
    WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end)
{
	WT_CURSOR *cursor, *start, *stop;
	WT_DECL_RET;
	WT_ITEM key, start_key, stop_key, value;
	WT_SESSION_IMPL *session;
	uint64_t recno, start_recno, stop_recno;
	uint32_t fileid, mode, optype, opsize;

	session = r->session;
	cursor = NULL;

	/* Peek at the size and the type. */
	WT_ERR(__wt_logop_read(session, pp, end, &optype, &opsize));
	end = *pp + opsize;

	switch (optype) {
	case WT_LOGOP_COL_PUT:
		WT_ERR(__wt_logop_col_put_unpack(session, pp, end,
		    &fileid, &recno, &value));
		GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
		cursor->set_key(cursor, recno);
		__wt_cursor_set_raw_value(cursor, &value);
		WT_ERR(cursor->insert(cursor));
		break;

	case WT_LOGOP_COL_REMOVE:
		WT_ERR(__wt_logop_col_remove_unpack(session, pp, end,
		    &fileid, &recno));
		GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
		cursor->set_key(cursor, recno);
		WT_ERR(cursor->remove(cursor));
		break;

	case WT_LOGOP_COL_TRUNCATE:
		WT_ERR(__wt_logop_col_truncate_unpack(session, pp, end,
		    &fileid, &start_recno, &stop_recno));
		GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);

		/* Set up the cursors. */
		if (start_recno == 0) {
			start = NULL;
			stop = cursor;
		} else if (stop_recno == 0) {
			start = cursor;
			stop = NULL;
		} else {
			start = cursor;
			WT_ERR(__recovery_cursor(
			    session, r, lsnp, fileid, 1, &stop));
		}

		/* Set the keys. */
		if (start != NULL)
			start->set_key(start, start_recno);
		if (stop != NULL)
			stop->set_key(stop, stop_recno);

		WT_TRET(session->iface.truncate(&session->iface, NULL,
		    start, stop, NULL));
		/* If we opened a duplicate cursor, close it now. */
		if (stop != NULL && stop != cursor)
			WT_TRET(stop->close(stop));
		WT_ERR(ret);
		break;

	case WT_LOGOP_ROW_PUT:
		WT_ERR(__wt_logop_row_put_unpack(session, pp, end,
		    &fileid, &key, &value));
		GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
		__wt_cursor_set_raw_key(cursor, &key);
		__wt_cursor_set_raw_value(cursor, &value);
		WT_ERR(cursor->insert(cursor));
		break;

	case WT_LOGOP_ROW_REMOVE:
		WT_ERR(__wt_logop_row_remove_unpack(session, pp, end,
		    &fileid, &key));
		GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
		__wt_cursor_set_raw_key(cursor, &key);
		WT_ERR(cursor->remove(cursor));
		break;

	case WT_LOGOP_ROW_TRUNCATE:
		WT_ERR(__wt_logop_row_truncate_unpack(session, pp, end,
		    &fileid, &start_key, &stop_key, &mode));
		GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);

		/* Set up the cursors. */
		start = stop = NULL;
		switch (mode) {
		case TXN_TRUNC_ALL:
			/* Both cursors stay NULL. */
			break;
		case TXN_TRUNC_BOTH:
			start = cursor;
			WT_ERR(__recovery_cursor(
			    session, r, lsnp, fileid, 1, &stop));
			break;
		case TXN_TRUNC_START:
			start = cursor;
			break;
		case TXN_TRUNC_STOP:
			stop = cursor;
			break;

		WT_ILLEGAL_VALUE_ERR(session);
		}

		/* Set the keys. */
		if (start != NULL)
			__wt_cursor_set_raw_key(start, &start_key);
		if (stop != NULL)
			__wt_cursor_set_raw_key(stop, &stop_key);

		WT_TRET(session->iface.truncate(&session->iface, NULL,
		    start, stop, NULL));
		/* If we opened a duplicate cursor, close it now. */
		if (stop != NULL && stop != cursor)
			WT_TRET(stop->close(stop));
		WT_ERR(ret);
		break;

	WT_ILLEGAL_VALUE_ERR(session);
	}

	/* Reset the cursor so it doesn't block eviction. */
	if (cursor != NULL)
		WT_ERR(cursor->reset(cursor));

	r->modified = 1;

err:	if (ret != 0)
		__wt_err(session, ret, "Operation failed during recovery");
	return (ret);
}
/*
 * __drop_table --
 *	WT_SESSION::drop for a table.
 */
static int
__drop_table(
    WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
{
	WT_COLGROUP *colgroup;
	WT_DECL_RET;
	WT_INDEX *idx;
	WT_TABLE *table;
	u_int i;
	const char *name;
	bool tracked;

	WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE));

	name = uri;
	WT_PREFIX_SKIP_REQUIRED(session, name, "table:");

	table = NULL;
	tracked = false;

	/*
	 * Open the table so we can drop its column groups and indexes.
	 *
	 * Ideally we would keep the table locked exclusive across the drop,
	 * but for now we rely on the global table lock to prevent the table
	 * being reopened while it is being dropped.  One issue is that the
	 * WT_WITHOUT_LOCKS macro can drop and reacquire the global table
	 * lock, avoiding deadlocks while waiting for LSM operations to
	 * quiesce.
	 *
	 * Temporarily getting the table exclusively serves the purpose
	 * of ensuring that cursors on the table that are already open
	 * must at least be closed before this call proceeds.
	 */
	WT_ERR(__wt_schema_get_table_uri(
	    session, uri, true, WT_DHANDLE_EXCLUSIVE, &table));
	WT_ERR(__wt_schema_release_table(session, table));
	WT_ERR(__wt_schema_get_table_uri(session, uri, true, 0, &table));

	/* Drop the column groups. */
	for (i = 0; i < WT_COLGROUPS(table); i++) {
		if ((colgroup = table->cgroups[i]) == NULL)
			continue;
		/*
		 * Drop the column group before updating the metadata to avoid
		 * the metadata for the table becoming inconsistent if we can't
		 * get exclusive access.
		 */
		WT_ERR(__wt_schema_drop(session, colgroup->source, cfg));
		WT_ERR(__wt_metadata_remove(session, colgroup->name));
	}

	/* Drop the indices. */
	WT_ERR(__wt_schema_open_indices(session, table));
	for (i = 0; i < table->nindices; i++) {
		if ((idx = table->indices[i]) == NULL)
			continue;
		/*
		 * Drop the index before updating the metadata to avoid
		 * the metadata for the table becoming inconsistent if we can't
		 * get exclusive access.
		 */
		WT_ERR(__wt_schema_drop(session, idx->source, cfg));
		WT_ERR(__wt_metadata_remove(session, idx->name));
	}

	/* Make sure the table data handle is closed. */
	WT_TRET(__wt_schema_release_table(session, table));
	WT_ERR(__wt_schema_get_table_uri(
	    session, uri, true, WT_DHANDLE_EXCLUSIVE, &table));
	F_SET(&table->iface, WT_DHANDLE_DISCARD);
	if (WT_META_TRACKING(session)) {
		WT_WITH_DHANDLE(session, &table->iface,
		    ret = __wt_meta_track_handle_lock(session, false));
		WT_ERR(ret);
		tracked = true;
	}

	/* Remove the metadata entry (ignore missing items). */
	WT_ERR(__wt_metadata_remove(session, uri));

err:	if (table != NULL && !tracked)
		WT_TRET(__wt_schema_release_table(session, table));
	return (ret);
}
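/*
 * Hypothetical usage sketch (not part of the original file): __drop_table
 * above is reached through WT_SESSION::drop with a "table:" URI.  The URI
 * is an illustrative assumption; "force" suppresses the error if the table
 * does not exist.
 */
static int
example_drop_table(WT_SESSION *wt_session)
{
	return (wt_session->drop(wt_session, "table:people", "force"));
}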
/*
 * __page_read --
 *	Read a page from the file.
 */
static int
__page_read(WT_SESSION_IMPL *session, WT_REF *ref)
{
	const WT_PAGE_HEADER *dsk;
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_ITEM tmp;
	WT_PAGE *page;
	size_t addr_size;
	uint32_t previous_state;
	const uint8_t *addr;

	btree = S2BT(session);
	page = NULL;

	/*
	 * Don't pass an allocated buffer to the underlying block read
	 * function, force allocation of new memory of the appropriate size.
	 */
	WT_CLEAR(tmp);

	/*
	 * Attempt to set the state to WT_REF_READING for normal reads, or
	 * WT_REF_LOCKED for deleted pages.  If successful, we've won the
	 * race, read the page.
	 */
	if (__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_READING))
		previous_state = WT_REF_DISK;
	else if (__wt_atomic_casv32(
	    &ref->state, WT_REF_DELETED, WT_REF_LOCKED))
		previous_state = WT_REF_DELETED;
	else
		return (0);

	/*
	 * Get the address: if there is no address, the page was deleted, but
	 * a subsequent search or insert is forcing re-creation of the name
	 * space.
	 */
	WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
	if (addr == NULL) {
		WT_ASSERT(session, previous_state == WT_REF_DELETED);

		WT_ERR(__wt_btree_new_leaf_page(session, &page));
		ref->page = page;
		goto done;
	}

	/*
	 * There's an address, read or map the backing disk page and build an
	 * in-memory version of the page.
	 */
	WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size));
	WT_ERR(__wt_page_inmem(session, ref, tmp.data, tmp.memsize,
	    WT_DATA_IN_ITEM(&tmp) ?
	    WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));

	/*
	 * Clear the local reference to an allocated copy of the disk image on
	 * return; the page steals it, errors in this code should not free it.
	 */
	tmp.mem = NULL;

	/*
	 * If reading for a checkpoint, there's no additional work to do, the
	 * page on disk is correct as written.
	 */
	if (session->dhandle->checkpoint != NULL)
		goto done;

	/* If the page was deleted, instantiate that information. */
	if (previous_state == WT_REF_DELETED)
		WT_ERR(__wt_delete_page_instantiate(session, ref));

	/*
	 * Instantiate updates from the database's lookaside table.  The page
	 * flag was set when the page was written, potentially a long time
	 * ago.  We only care if the lookaside table is currently active,
	 * check that before doing any work.
	 */
	dsk = tmp.data;
	if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE) &&
	    __wt_las_is_written(session)) {
		WT_STAT_FAST_CONN_INCR(session, cache_read_lookaside);
		WT_STAT_FAST_DATA_INCR(session, cache_read_lookaside);

		WT_ERR(__las_page_instantiate(
		    session, ref, btree->id, addr, addr_size));
	}

done:	WT_PUBLISH(ref->state, WT_REF_MEM);
	return (0);

err:	/*
	 * If the function building an in-memory version of the page failed,
	 * it discarded the page, but not the disk image.  Discard the page
	 * and separately discard the disk image in all cases.
	 */
	if (ref->page != NULL)
		__wt_ref_out(session, ref);
	WT_PUBLISH(ref->state, previous_state);
	__wt_buf_free(session, &tmp);

	return (ret);
}
/*
 * __wt_lsm_merge_worker --
 *	The merge worker thread for an LSM tree, responsible for merging
 *	on-disk trees.
 */
void *
__wt_lsm_merge_worker(void *vargs)
{
	WT_DECL_RET;
	WT_LSM_WORKER_ARGS *args;
	WT_LSM_TREE *lsm_tree;
	WT_SESSION_IMPL *session;
	u_int aggressive, chunk_wait, id, old_aggressive, stallms;
	int progress;

	args = vargs;
	lsm_tree = args->lsm_tree;
	id = args->id;
	session = lsm_tree->worker_sessions[id];
	__wt_free(session, args);

	aggressive = stallms = 0;

	while (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) {
		/*
		 * Help out with switching chunks in case the checkpoint
		 * worker is busy.
		 */
		if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
			WT_WITH_SCHEMA_LOCK(session,
			    ret = __wt_lsm_tree_switch(session, lsm_tree));
			WT_ERR(ret);
		}

		progress = 0;

		/* Clear any state from previous worker thread iterations. */
		session->dhandle = NULL;

		/* Try to create a Bloom filter. */
		if (__lsm_bloom_work(session, lsm_tree) == 0)
			progress = 1;

		/* If we didn't create a Bloom filter, try to merge. */
		if (progress == 0 &&
		    __wt_lsm_merge(session, lsm_tree, id, aggressive) == 0)
			progress = 1;

		/* Clear any state from previous worker thread iterations. */
		WT_CLEAR_BTREE_IN_SESSION(session);

		/*
		 * Only have one thread freeing old chunks, and only if there
		 * are chunks to free.
		 */
		if (id == 0 && lsm_tree->nold_chunks > 0 &&
		    __lsm_free_chunks(session, lsm_tree) == 0)
			progress = 1;

		if (progress)
			stallms = 0;
		else if (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING) &&
		    !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
			/* Poll 10 times per second. */
			WT_ERR_TIMEDOUT_OK(__wt_cond_wait(
			    session, lsm_tree->work_cond, 100000));
			stallms += 100;

			/*
			 * Get aggressive if more than enough chunks for a
			 * merge should have been created while we waited.
			 * Use 10 seconds as a default if we don't have an
			 * estimate.
			 */
			chunk_wait = stallms /
			    (lsm_tree->chunk_fill_ms == 0 ?
			    10000 : lsm_tree->chunk_fill_ms);
			old_aggressive = aggressive;
			aggressive = chunk_wait / lsm_tree->merge_min;

			if (aggressive > old_aggressive)
				WT_VERBOSE_ERR(session, lsm,
				    "LSM merge got aggressive (%u), "
				    "%u / %" PRIu64,
				    aggressive, stallms,
				    lsm_tree->chunk_fill_ms);
		}
	}

	if (0) {
err:		__wt_err(session, ret, "LSM merge worker failed");
	}

	return (NULL);
}
/*
 * __wt_open --
 *	Open a file handle.
 */
int
__wt_open(WT_SESSION_IMPL *session,
    const char *name, uint32_t file_type, uint32_t flags, WT_FH **fhp)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_FH *fh;
	bool lock_file, open_called;
	char *path;

	WT_ASSERT(session, file_type != 0);	/* A file type is required. */

	conn = S2C(session);
	fh = NULL;
	open_called = false;
	path = NULL;

	WT_RET(__open_verbose(session, name, file_type, flags));

	/* Check if the handle is already open. */
	if (__wt_handle_search(session, name, true, NULL, &fh)) {
		/*
		 * XXX
		 * The in-memory implementation has to reset the file offset
		 * when a file is re-opened (which obviously also depends on
		 * in-memory configurations never opening a file in more than
		 * one thread at a time).  This needs to be fixed.
		 */
		if (F_ISSET(fh, WT_FH_IN_MEMORY) && fh->ref == 1)
			fh->off = 0;
		*fhp = fh;
		return (0);
	}

	/* Allocate a structure and set the name. */
	WT_ERR(__wt_calloc_one(session, &fh));
	WT_ERR(__wt_strdup(session, name, &fh->name));

	/*
	 * If this is a read-only connection, open all files read-only except
	 * the lock file.
	 *
	 * The only file created in read-only mode is the lock file.
	 */
	if (F_ISSET(conn, WT_CONN_READONLY)) {
		lock_file = strcmp(name, WT_SINGLETHREAD) == 0;
		if (!lock_file)
			LF_SET(WT_OPEN_READONLY);
		WT_ASSERT(session, lock_file || !LF_ISSET(WT_OPEN_CREATE));
	}

	/* Create the path to the file. */
	if (!LF_ISSET(WT_OPEN_FIXED))
		WT_ERR(__wt_filename(session, name, &path));

	/* Call the underlying open function. */
	WT_ERR(conn->handle_open(
	    session, fh, path == NULL ? name : path, file_type, flags));
	open_called = true;

	/*
	 * Repeat the check for a match: if there's no match, link our newly
	 * created handle onto the database's list of files.
	 */
	if (__wt_handle_search(session, name, true, fh, fhp)) {
err:		if (open_called)
			WT_TRET(fh->fh_close(session, fh));
		if (fh != NULL) {
			__wt_free(session, fh->name);
			__wt_free(session, fh);
		}
	}

	__wt_free(session, path);
	return (ret);
}
/*
 * __snapshot_process --
 *	Process the list of snapshots.
 */
static int
__snapshot_process(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snapbase)
{
	WT_BLOCK_SNAPSHOT *a, *b, *si;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_SNAPSHOT *snap;
	uint64_t snapshot_size;
	int deleting, locked;

	si = &block->live;
	locked = 0;

	/*
	 * We've allocated our last page, update the snapshot size.  We need to
	 * calculate the live system's snapshot size before reading and merging
	 * snapshot allocation and discard information from the snapshots we're
	 * deleting, those operations will change the underlying byte counts.
	 */
	snapshot_size = si->snapshot_size;
	snapshot_size += si->alloc.bytes;
	snapshot_size -= si->discard.bytes;

	/*
	 * Extents that become newly available as a result of deleting previous
	 * snapshots are added to a list of extents.  The list should be empty,
	 * but there's no explicit "free the snapshot information" call into
	 * the block manager; if there was an error in an upper level resulting
	 * in the snapshot never being "resolved", the list might not be empty.
	 *
	 * XXX
	 * This isn't sufficient, actually: we're going to leak all the blocks
	 * that were written as part of the last snapshot because it was never
	 * resolved.
	 */
	__wt_block_extlist_free(session, &si->snapshot_avail);
	WT_RET(__wt_block_extlist_init(
	    session, &si->snapshot_avail, "live", "snapshot_avail"));

	/*
	 * To delete a snapshot, we'll need snapshot information for it, and we
	 * have to read that from the disk.
	 */
	deleting = 0;
	WT_SNAPSHOT_FOREACH(snapbase, snap) {
		/*
		 * To delete a snapshot, we'll need snapshot information for it
		 * and the subsequent snapshot.  The test is tricky, we have to
		 * load the current snapshot's information if it's marked for
		 * deletion, or if it follows a snapshot marked for deletion,
		 * where the boundary cases are the first snapshot in the list
		 * and the last snapshot in the list: if we're deleting the
		 * last snapshot in the list, there's no next snapshot, the
		 * snapshot will be merged into the live tree.
		 */
		if (!F_ISSET(snap, WT_SNAP_DELETE) &&
		    (snap == snapbase ||
		    F_ISSET(snap, WT_SNAP_ADD) ||
		    !F_ISSET(snap - 1, WT_SNAP_DELETE)))
			continue;
		deleting = 1;

		/*
		 * Allocate a snapshot structure, crack the cookie and read the
		 * snapshot's extent lists.
		 *
		 * Ignore the avail list: snapshot avail lists are only useful
		 * if we are rolling forward from the particular snapshot and
		 * they represent our best understanding of what blocks can be
		 * allocated.  If we are not operating on the live snapshot,
		 * subsequent snapshots might have allocated those blocks, and
		 * the avail list is useless.  We don't discard it, because it
		 * is useful as part of verification, but we don't re-write it
		 * either.
		 */
		WT_ERR(__wt_calloc(
		    session, 1, sizeof(WT_BLOCK_SNAPSHOT), &snap->bpriv));
		si = snap->bpriv;
		WT_ERR(__wt_block_snap_init(
		    session, block, si, snap->name, 0));
		WT_ERR(__wt_block_buffer_to_snapshot(
		    session, block, snap->raw.data, si));
		WT_ERR(__wt_block_extlist_read(session, block, &si->alloc));
		WT_ERR(__wt_block_extlist_read(session, block, &si->discard));
	}

	/*
	 * Hold a lock so the live extent lists and the file size can't change
	 * underneath us.  I suspect we'll tighten this if snapshots take too
	 * much time away from real work: we read historic snapshot information
	 * without a lock, but we could also merge and re-write the delete
	 * snapshot information without a lock, except for ranges merged into
	 * the live tree.
	 */
	__wt_spin_lock(session, &block->live_lock);
	locked = 1;

	/* Skip the additional processing if we aren't deleting snapshots. */
	if (!deleting)
		goto live_update;

	/*
	 * Delete any no-longer-needed snapshots: we do this first as it frees
	 * blocks to the live lists, and the freed blocks will then be included
	 * when writing the live extent lists.
	 */
	WT_SNAPSHOT_FOREACH(snapbase, snap) {
		if (!F_ISSET(snap, WT_SNAP_DELETE))
			continue;

		if (WT_VERBOSE_ISSET(session, snapshot)) {
			if (tmp == NULL)
				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
			WT_ERR(__snapshot_string(
			    session, block, snap->raw.data, tmp));
			WT_VERBOSE_ERR(session, snapshot,
			    "%s: delete-snapshot: %s: %s",
			    block->name, snap->name, (char *)tmp->data);
		}

		/*
		 * Set the from/to snapshot structures, where the "to" value
		 * may be the live tree.
		 */
		a = snap->bpriv;
		if (F_ISSET(snap + 1, WT_SNAP_ADD))
			b = &block->live;
		else
			b = (snap + 1)->bpriv;

		/*
		 * Free the root page: there's nothing special about this free,
		 * the root page is allocated using normal rules, that is, it
		 * may have been taken from the avail list, and was entered on
		 * the live system's alloc list at that time.  We free it into
		 * the snapshot's discard list, however, not the live system's
		 * list because it appears on the snapshot's alloc list and so
		 * must be paired in the snapshot.
		 */
		if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
			WT_ERR(__wt_block_insert_ext(session,
			    &a->discard, a->root_offset, a->root_size));

		/*
		 * Free the blocks used to hold the "from" snapshot's extent
		 * lists directly to the live system's avail list, they were
		 * never on any alloc list.  Include the "from" snapshot's
		 * avail list, it's going away.
		 */
		WT_ERR(__snapshot_extlist_fblocks(session, block, &a->alloc));
		WT_ERR(__snapshot_extlist_fblocks(session, block, &a->avail));
		WT_ERR(__snapshot_extlist_fblocks(
		    session, block, &a->discard));

		/*
		 * Roll the "from" alloc and discard extent lists into the
		 * "to" snapshot's lists.
		 */
		if (a->alloc.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, &a->alloc, &b->alloc));
		if (a->discard.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, &a->discard, &b->discard));

		/*
		 * If the "to" snapshot is also being deleted, we're done with
		 * it, it's merged into some other snapshot in the next loop.
		 * This means the extent lists may aggregate over a number of
		 * snapshots, but that's OK, they're disjoint sets of ranges.
		 */
		if (F_ISSET(snap + 1, WT_SNAP_DELETE))
			continue;

		/*
		 * Find blocks for re-use: wherever the "to" snapshot's
		 * allocate and discard lists overlap is fair game, move
		 * ranges appearing on both lists to the live snapshot's newly
		 * available list.
		 */
		WT_ERR(__wt_block_extlist_overlap(session, block, b));

		/*
		 * If we're updating the live system's information, we're done.
		 */
		if (F_ISSET(snap + 1, WT_SNAP_ADD))
			continue;

		/*
		 * We have to write the "to" snapshot's extent lists out in new
		 * blocks, and update its cookie.
		 *
		 * Free the blocks used to hold the "to" snapshot's extent
		 * lists directly to the live system's avail list, they were
		 * never on any alloc list.  Do not include the "to" snapshot's
		 * avail list, it's not changing.
		 */
		WT_ERR(__snapshot_extlist_fblocks(session, block, &b->alloc));
		WT_ERR(__snapshot_extlist_fblocks(
		    session, block, &b->discard));

		F_SET(snap + 1, WT_SNAP_UPDATE);
	}

	/* Update snapshots marked for update. */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if (F_ISSET(snap, WT_SNAP_UPDATE)) {
			WT_ASSERT(session, !F_ISSET(snap, WT_SNAP_ADD));
			WT_ERR(__snapshot_update(
			    session, block, snap, snap->bpriv, 0, 0));
		}

live_update:
	si = &block->live;

	/* Truncate the file if that's possible. */
	WT_ERR(__wt_block_extlist_truncate(session, block, &si->avail));

	/* Update the final, added snapshot based on the live system. */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if (F_ISSET(snap, WT_SNAP_ADD)) {
			WT_ERR(__snapshot_update(
			    session, block, snap, si, snapshot_size, 1));

			/*
			 * XXX
			 * Our caller wants two pieces of information: the time
			 * the snapshot was taken and the final snapshot size.
			 * This violates layering but the alternative is a call
			 * for the btree layer to crack the snapshot cookie
			 * into its components, and that's a fair amount of
			 * work.  (We could just read the system time in the
			 * session layer when updating the metadata file, but
			 * that won't work for the snapshot size, and so we do
			 * both here.)
			 */
			snap->snapshot_size = si->snapshot_size;
			WT_ERR(__wt_epoch(session, &snap->sec, NULL));
		}

	/*
	 * Reset the live system's alloc and discard extent lists, leave the
	 * avail list alone.
	 */
	__wt_block_extlist_free(session, &si->alloc);
	WT_ERR(__wt_block_extlist_init(session, &si->alloc, "live", "alloc"));
	__wt_block_extlist_free(session, &si->discard);
	WT_ERR(__wt_block_extlist_init(
	    session, &si->discard, "live", "discard"));

#ifdef HAVE_DIAGNOSTIC
	/*
	 * The first snapshot in the system should always have an empty discard
	 * list.  If we've read that snapshot and/or created it, check.
	 */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if (!F_ISSET(snap, WT_SNAP_DELETE))
			break;
	if ((a = snap->bpriv) == NULL)
		a = &block->live;
	if (a->discard.entries != 0) {
		__wt_errx(session,
		    "snapshot incorrectly has blocks on the discard list");
		WT_ERR(WT_ERROR);
	}
#endif

err:	if (locked)
		__wt_spin_unlock(session, &block->live_lock);

	/* Discard any snapshot information we loaded, we no longer need it. */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if ((si = snap->bpriv) != NULL) {
			__wt_block_extlist_free(session, &si->alloc);
			__wt_block_extlist_free(session, &si->avail);
			__wt_block_extlist_free(session, &si->discard);
		}

	__wt_scr_free(&tmp);
	return (ret);
}
/*
 * __create_colgroup --
 *	Create a column group.
 */
static int
__create_colgroup(WT_SESSION_IMPL *session,
    const char *name, int exclusive, const char *config)
{
	WT_CONFIG_ITEM cval;
	WT_DECL_RET;
	WT_ITEM confbuf, fmt, namebuf;
	WT_TABLE *table;
	size_t tlen;
	const char **cfgp, *cfg[4] =
	    { WT_CONFIG_BASE(session, colgroup_meta), config, NULL, NULL };
	const char *sourcecfg[] = { config, NULL, NULL };
	const char *cgname, *source, *sourceconf, *tablename;
	char *cgconf, *oldconf;

	sourceconf = NULL;
	cgconf = oldconf = NULL;
	WT_CLEAR(fmt);
	WT_CLEAR(confbuf);
	WT_CLEAR(namebuf);

	tablename = name;
	if (!WT_PREFIX_SKIP(tablename, "colgroup:"))
		return (EINVAL);
	cgname = strchr(tablename, ':');
	if (cgname != NULL) {
		tlen = (size_t)(cgname - tablename);
		++cgname;
	} else
		tlen = strlen(tablename);

	if ((ret =
	    __wt_schema_get_table(session, tablename, tlen, 1, &table)) != 0)
		WT_RET_MSG(session, (ret == WT_NOTFOUND) ? ENOENT : ret,
		    "Can't create '%s' for non-existent table '%.*s'",
		    name, (int)tlen, tablename);

	/* Make sure the column group is referenced from the table. */
	if (cgname != NULL && (ret = __wt_config_subgets(
	    session, &table->cgconf, cgname, &cval)) != 0)
		WT_ERR_MSG(session, EINVAL,
		    "Column group '%s' not found in table '%.*s'",
		    cgname, (int)tlen, tablename);

	/* Find the first NULL entry in the cfg stack. */
	for (cfgp = &cfg[1]; *cfgp; cfgp++)
		;

	/* Add the source to the colgroup config before collapsing. */
	if (__wt_config_getones(
	    session, config, "source", &cval) == 0 && cval.len != 0) {
		WT_ERR(__wt_buf_fmt(
		    session, &namebuf, "%.*s", (int)cval.len, cval.str));
		source = namebuf.data;
	} else {
		WT_ERR(__wt_schema_colgroup_source(
		    session, table, cgname, config, &namebuf));
		source = namebuf.data;
		WT_ERR(__wt_buf_fmt(
		    session, &confbuf, "source=\"%s\"", source));
		*cfgp++ = confbuf.data;
	}

	/* Calculate the key/value formats: these go into the source config. */
	WT_ERR(__wt_buf_fmt(
	    session, &fmt, "key_format=%s", table->key_format));
	if (cgname == NULL)
		WT_ERR(__wt_buf_catfmt(
		    session, &fmt, ",value_format=%s", table->value_format));
	else {
		if (__wt_config_getones(
		    session, config, "columns", &cval) != 0)
			WT_ERR_MSG(session, EINVAL,
			    "No 'columns' configuration for '%s'", name);
		WT_ERR(__wt_buf_catfmt(session, &fmt, ",value_format="));
		WT_ERR(__wt_struct_reformat(session,
		    table, cval.str, cval.len, NULL, 1, &fmt));
	}
	sourcecfg[1] = fmt.data;
	WT_ERR(__wt_config_merge(session, sourcecfg, NULL, &sourceconf));

	WT_ERR(__wt_schema_create(session, source, sourceconf));

	WT_ERR(__wt_config_collapse(session, cfg, &cgconf));

	if ((ret = __wt_metadata_insert(session, name, cgconf)) != 0) {
		/*
		 * If the entry already exists in the metadata, we're done.
		 * This is an error for exclusive creates but okay otherwise.
		 */
		if (ret == WT_DUPLICATE_KEY)
			ret = exclusive ? EEXIST : 0;
		goto err;
	}

	WT_ERR(__wt_schema_open_colgroups(session, table));

err:	__wt_free(session, cgconf);
	__wt_free(session, sourceconf);
	__wt_free(session, oldconf);
	__wt_buf_free(session, &confbuf);
	__wt_buf_free(session, &fmt);
	__wt_buf_free(session, &namebuf);

	__wt_schema_release_table(session, table);
	return (ret);
}
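/*
 * Hypothetical usage sketch (not part of the original file): column groups
 * are declared in the table's "colgroups" list and then created one by one,
 * which is the "colgroup:" path handled above.  Names, columns, and formats
 * are illustrative assumptions.
 */
static int
example_create_colgroups(WT_SESSION *wt_session)
{
	int ret;

	if ((ret = wt_session->create(wt_session, "table:metrics",
	    "key_format=S,value_format=SS,columns=(id,host,load),"
	    "colgroups=(identity,perf)")) != 0)
		return (ret);

	/* Each declared column group is created with its own columns. */
	if ((ret = wt_session->create(wt_session,
	    "colgroup:metrics:identity", "columns=(host)")) != 0)
		return (ret);
	return (wt_session->create(
	    wt_session, "colgroup:metrics:perf", "columns=(load)"));
}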
/* * __snapshot_update -- * Update a snapshot. */ static int __snapshot_update( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snap, WT_BLOCK_SNAPSHOT *si, uint64_t snapshot_size, int is_live) { WT_DECL_ITEM(tmp); WT_DECL_RET; uint8_t *endp; #ifdef HAVE_DIAGNOSTIC /* Check the extent list combinations for overlaps. */ WT_RET(__wt_block_extlist_check(session, &si->alloc, &si->avail)); WT_RET(__wt_block_extlist_check(session, &si->discard, &si->avail)); WT_RET(__wt_block_extlist_check(session, &si->alloc, &si->discard)); #endif /* * Write the snapshot's extent lists; we only write an avail list for * the live system, other snapshots' avail lists are static and never * change. When we do write the avail list for the live system it's * two lists: the current avail list plus the list of blocks that are * being made available as of the new snapshot. We can't merge that * second list into the real list yet, it's not truly available until * the new snapshot location has been saved to the metadata. */ WT_RET(__wt_block_extlist_write(session, block, &si->alloc, NULL)); if (is_live) WT_RET(__wt_block_extlist_write( session, block, &si->avail, &si->snapshot_avail)); WT_RET(__wt_block_extlist_write(session, block, &si->discard, NULL)); /* * Set the file size for the live system. * * XXX * We do NOT set the file size when re-writing snapshots because we want * to test the snapshot's blocks against a reasonable maximum file size * during verification. This is not good: imagine a snapshot appearing * early in the file, re-written, and then the snapshot requires blocks * at the end of the file, blocks after the listed file size. If the * application opens that snapshot for writing (discarding subsequent * snapshots), we would truncate the file to the early chunk, discarding * the re-written snapshot information. The alternative, updating the * file size, has its own problems: in that case we'd work correctly, but * we'd lose all of the blocks between the original snapshot and the * re-written snapshot. Currently, there's no API to roll forward * intermediate snapshots; if there ever is, this will need to be fixed. */ if (is_live) WT_RET(__wt_filesize(session, block->fh, &si->file_size)); /* Set the snapshot size for the live system. */ if (is_live) si->snapshot_size = snapshot_size; /* * Copy the snapshot information into the snapshot array's address * cookie. */ WT_RET(__wt_buf_init(session, &snap->raw, WT_BTREE_MAX_ADDR_COOKIE)); endp = snap->raw.mem; WT_RET(__wt_block_snapshot_to_buffer(session, block, &endp, si)); snap->raw.size = WT_PTRDIFF32(endp, snap->raw.mem); if (WT_VERBOSE_ISSET(session, snapshot)) { WT_RET(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__snapshot_string(session, block, snap->raw.data, tmp)); WT_VERBOSE_ERR(session, snapshot, "%s: create-snapshot: %s: %s", block->name, snap->name, (char *)tmp->data); } err: __wt_scr_free(&tmp); return (ret); }
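/*
 * Illustrative sketch, not WiredTiger code: the HAVE_DIAGNOSTIC checks
 * above assert that no extent appears on two lists at once.  Given two
 * extent lists sorted by file offset, a merge-style scan finds any
 * overlap in linear time; the types and names below are invented.
 */
#include <stddef.h>
#include <stdint.h>

struct sketch_ext {
    uint64_t off;               /* file offset */
    uint64_t size;              /* extent length */
};

static int
sketch_extlist_overlap(const struct sketch_ext *a, size_t an,
    const struct sketch_ext *b, size_t bn)
{
    size_t i, j;

    for (i = j = 0; i < an && j < bn;) {
        if (a[i].off + a[i].size <= b[j].off)
            ++i;            /* a[i] entirely before b[j] */
        else if (b[j].off + b[j].size <= a[i].off)
            ++j;            /* b[j] entirely before a[i] */
        else
            return (1);     /* the ranges intersect */
    }
    return (0);
}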
/* * __create_table -- * Create a table. */ static int __create_table(WT_SESSION_IMPL *session, const char *name, int exclusive, const char *config) { WT_CONFIG conf; WT_CONFIG_ITEM cgkey, cgval, cval; WT_DECL_RET; WT_TABLE *table; const char *cfg[4] = { WT_CONFIG_BASE(session, table_meta), config, NULL, NULL }; const char *tablename; char *tableconf, *cgname; size_t cgsize; int ncolgroups; cgname = NULL; table = NULL; tableconf = NULL; tablename = name; if (!WT_PREFIX_SKIP(tablename, "table:")) return (EINVAL); if ((ret = __wt_schema_get_table(session, tablename, strlen(tablename), 0, &table)) == 0) { __wt_schema_release_table(session, table); return (exclusive ? EEXIST : 0); } WT_RET_NOTFOUND_OK(ret); WT_ERR(__wt_config_gets(session, cfg, "colgroups", &cval)); WT_ERR(__wt_config_subinit(session, &conf, &cval)); for (ncolgroups = 0; (ret = __wt_config_next(&conf, &cgkey, &cgval)) == 0; ncolgroups++) ; WT_ERR_NOTFOUND_OK(ret); WT_ERR(__wt_config_collapse(session, cfg, &tableconf)); if ((ret = __wt_metadata_insert(session, name, tableconf)) != 0) { /* * If the entry already exists in the metadata, we're done. * This is an error for exclusive creates but okay otherwise. */ if (ret == WT_DUPLICATE_KEY) ret = exclusive ? EEXIST : 0; goto err; } /* Attempt to open the table now to catch any errors. */ WT_ERR(__wt_schema_get_table( session, tablename, strlen(tablename), 1, &table)); if (ncolgroups == 0) { cgsize = strlen("colgroup:") + strlen(tablename) + 1; WT_ERR(__wt_calloc_def(session, cgsize, &cgname)); snprintf(cgname, cgsize, "colgroup:%s", tablename); WT_ERR(__create_colgroup(session, cgname, exclusive, config)); } if (0) { err: if (table != NULL) { WT_TRET(__wt_schema_remove_table(session, table)); table = NULL; } } if (table != NULL) __wt_schema_release_table(session, table); __wt_free(session, cgname); __wt_free(session, tableconf); return (ret); }
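/*
 * Illustrative sketch, not WiredTiger code: when a table is created with
 * no explicit column groups, the code above synthesizes the default
 * "colgroup:<table>" URI.  A stand-alone version of that allocation and
 * formatting, using only the standard library, might look like this.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *
sketch_default_colgroup_uri(const char *tablename)
{
    size_t len;
    char *uri;

    len = strlen("colgroup:") + strlen(tablename) + 1;
    if ((uri = calloc(len, 1)) == NULL)
        return (NULL);
    (void)snprintf(uri, len, "colgroup:%s", tablename);
    return (uri);           /* the caller frees the result */
}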
/* * __wt_block_snapshot_load -- * Load a snapshot. */ int __wt_block_snapshot_load(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *dsk, const uint8_t *addr, uint32_t addr_size, int readonly) { WT_BLOCK_SNAPSHOT *si; WT_DECL_ITEM(tmp); WT_DECL_RET; WT_UNUSED(addr_size); /* * Sometimes we don't find a root page (we weren't given a snapshot, * or the referenced snapshot was empty). In that case we return a * root page size of 0. Set that up now. */ dsk->size = 0; si = &block->live; WT_RET(__wt_block_snap_init(session, block, si, "live", 1)); if (WT_VERBOSE_ISSET(session, snapshot)) { if (addr != NULL) { WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__snapshot_string(session, block, addr, tmp)); } WT_VERBOSE_ERR(session, snapshot, "%s: load-snapshot: %s", block->name, addr == NULL ? "[Empty]" : (char *)tmp->data); } /* If not loading a snapshot from disk, we're done. */ if (addr == NULL || addr_size == 0) return (0); /* Crack the snapshot cookie. */ if (addr != NULL) WT_ERR(__wt_block_buffer_to_snapshot(session, block, addr, si)); /* If the file is being verified, set up the snapshot for verification. */ if (block->verify) WT_ERR(__wt_verify_snap_load(session, block, si)); /* Read, and optionally verify, any root page. */ if (si->root_offset != WT_BLOCK_INVALID_OFFSET) { WT_ERR(__wt_block_read_off(session, block, dsk, si->root_offset, si->root_size, si->root_cksum)); if (block->verify) { if (tmp == NULL) { WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__snapshot_string( session, block, addr, tmp)); } WT_ERR( __wt_verify_dsk(session, (char *)tmp->data, dsk)); } } /* * Rolling a snapshot forward requires the avail list, the blocks from * which we can allocate. */ if (!readonly) WT_ERR(__wt_block_extlist_read(session, block, &si->avail)); /* * If the snapshot can be written, that means anything written after * the snapshot is no longer interesting. Truncate the file. */ if (!readonly) { WT_VERBOSE_ERR(session, snapshot, "truncate file to %" PRIuMAX, (uintmax_t)si->file_size); WT_ERR(__wt_ftruncate(session, block->fh, si->file_size)); } if (0) { err: (void)__wt_block_snapshot_unload(session, block); } __wt_scr_free(&tmp); return (ret); }
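/*
 * Illustrative sketch, not WiredTiger code: "cracking the snapshot
 * cookie" above decodes an opaque byte string back into the root page's
 * offset, size and checksum.  WiredTiger packs the fields as
 * variable-length integers; the fixed-width layout below is invented to
 * keep the example short, and it assumes a little-endian host.
 */
#include <stdint.h>
#include <string.h>

struct sketch_root_addr {
    uint64_t offset;            /* root page offset */
    uint32_t size;              /* root page size */
    uint32_t cksum;             /* root page checksum */
};

static int
sketch_crack_cookie(
    const uint8_t *addr, size_t addr_size, struct sketch_root_addr *rap)
{
    if (addr_size < 16)
        return (-1);            /* cookie too short */
    memcpy(&rap->offset, addr, 8);
    memcpy(&rap->size, addr + 8, 4);
    memcpy(&rap->cksum, addr + 12, 4);
    return (0);
}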
/* * __wt_tree_walk -- * Move to the next/previous page in the tree. */ int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; WT_PAGE_INDEX *pindex; WT_REF *couple, *couple_orig, *ref; bool empty_internal, prev, skip; uint32_t slot; btree = S2BT(session); empty_internal = false; /* * Tree walks are special: they look inside page structures that splits * may want to free. Publish that the tree is active during this * window. */ WT_ENTER_PAGE_INDEX(session); /* Walk should never instantiate deleted pages. */ LF_SET(WT_READ_NO_EMPTY); /* * !!! * Fast-truncate currently only works on row-store trees. */ if (btree->type != BTREE_ROW) LF_CLR(WT_READ_TRUNCATE); prev = LF_ISSET(WT_READ_PREV) ? 1 : 0; /* * There are multiple reasons and approaches to walking the in-memory * tree: * * (1) finding pages to evict (the eviction server); * (2) writing just dirty leaves or internal nodes (checkpoint); * (3) discarding pages (close); * (4) truncating pages in a range (fast truncate); * (5) skipping pages based on outside information (compaction); * (6) cursor scans (applications). * * Except for cursor scans and compaction, the walk is limited to the * cache, no pages are read. In all cases, hazard pointers protect the * walked pages from eviction. * * Walks use hazard-pointer coupling through the tree and that's OK * (hazard pointers can't deadlock, so there's none of the usual * problems found when logically locking up a btree). If the eviction * thread tries to evict the active page, it fails because of our * hazard pointer. If eviction tries to evict our parent, that fails * because the parent has a child page that can't be discarded. We do * play one game: don't couple up to our parent and then back down to a * new leaf, couple to the next page to which we're descending, it * saves a hazard-pointer swap for each cursor page movement. * * !!! * NOTE: we depend on the fact it's OK to release a page we don't hold, * that is, it's OK to release couple when couple is set to NULL. * * Take a copy of any held page and clear the return value. Remember * the hazard pointer we're currently holding. * * We may be passed a pointer to btree->evict_page that we are clearing * here. We check when discarding pages that we're not discarding that * page, so this clear must be done before the page is released. */ couple = couple_orig = ref = *refp; *refp = NULL; /* If no page is active, begin a walk from the start of the tree. */ if (ref == NULL) { ref = &btree->root; if (ref->page == NULL) goto done; goto descend; } ascend: /* * If the active page was the root, we've reached the walk's end. * Release any hazard-pointer we're holding. */ if (__wt_ref_is_root(ref)) { WT_ERR(__wt_page_release(session, couple, flags)); goto done; } /* Figure out the current slot in the WT_REF array. */ __page_refp(session, ref, &pindex, &slot); for (;;) { /* * If we're at the last/first slot on the page, return this page * in post-order traversal. Otherwise we move to the next/prev * slot and left/right-most element in its subtree. */ if ((prev && slot == 0) || (!prev && slot == pindex->entries - 1)) { ref = ref->home->pg_intl_parent_ref; /* * If we got all the way through an internal page and * all of the child pages were deleted, mark it for * eviction. */ if (empty_internal && pindex->entries > 1) { __wt_page_evict_soon(ref->page); empty_internal = false; } /* Optionally skip internal pages. 
*/ if (LF_ISSET(WT_READ_SKIP_INTL)) goto ascend; /* * We've ascended the tree and are returning an internal * page. If it's the root, discard our hazard pointer, * otherwise, swap our hazard pointer for the page we'll * return. */ if (__wt_ref_is_root(ref)) WT_ERR(__wt_page_release( session, couple, flags)); else { /* * Locate the reference to our parent page then * swap our child hazard pointer for the parent. * We don't handle restart or not-found returns. * It would require additional complexity and is * not a possible return: we're moving to the * parent of the current child page, our parent * reference can't have split or been evicted. */ __page_refp(session, ref, &pindex, &slot); if ((ret = __wt_page_swap( session, couple, ref, flags)) != 0) { WT_TRET(__wt_page_release( session, couple, flags)); WT_ERR(ret); } } *refp = ref; goto done; } if (prev) --slot; else ++slot; if (walkcntp != NULL) ++*walkcntp; for (;;) { /* * Move to the next slot, and set the reference hint if * it's wrong (used when we continue the walk). We don't * update those hints when splitting, so it's common for * them to be incorrect in some workloads. */ ref = pindex->index[slot]; if (ref->pindex_hint != slot) ref->pindex_hint = slot; /* * If we see any child states other than deleted, the * page isn't empty. */ if (ref->state != WT_REF_DELETED && !LF_ISSET(WT_READ_TRUNCATE)) empty_internal = false; if (LF_ISSET(WT_READ_CACHE)) { /* * Only look at unlocked pages in memory: * fast-path some common cases. */ if (LF_ISSET(WT_READ_NO_WAIT) && ref->state != WT_REF_MEM) break; } else if (LF_ISSET(WT_READ_TRUNCATE)) { /* * Avoid pulling a deleted page back in to try * to delete it again. */ if (ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, ref, false)) break; /* * If deleting a range, try to delete the page * without instantiating it. */ WT_ERR(__wt_delete_page(session, ref, &skip)); if (skip) break; empty_internal = false; } else if (LF_ISSET(WT_READ_COMPACT)) { /* * Skip deleted pages, rewriting them doesn't * seem useful. */ if (ref->state == WT_REF_DELETED) break; /* * If the page is in-memory, we want to look at * it (it may have been modified and written, * and the current location is the interesting * one in terms of compaction, not the original * location). If the page isn't in-memory, test * if the page will help with compaction, don't * read it if we don't have to. */ if (ref->state == WT_REF_DISK) { WT_ERR(__wt_compact_page_skip( session, ref, &skip)); if (skip) break; } } else { /* * Try to skip deleted pages visible to us. */ if (ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, ref, false)) break; } ret = __wt_page_swap(session, couple, ref, flags); /* * Not-found is an expected return when only walking * in-cache pages, or if we see a deleted page. */ if (ret == WT_NOTFOUND) { ret = 0; break; } /* * The page we're moving to might have split, in which * case move to the last position we held. */ if (ret == WT_RESTART) { ret = 0; /* * If a new walk that never coupled from the * root to a new saved position in the tree, * restart the walk. */ if (couple == &btree->root) { ref = &btree->root; if (ref->page == NULL) goto done; goto descend; } /* * If restarting from some original position, * repeat the increment or decrement we made at * that time. Otherwise, couple is an internal * page we've acquired after moving from that * starting position and we can treat it as a * new page. 
This works because we never acquire * a hazard pointer on a leaf page we're not * going to return to our caller, this will quit * working if that ever changes. */ WT_ASSERT(session, couple == couple_orig || WT_PAGE_IS_INTERNAL(couple->page)); ref = couple; __page_refp(session, ref, &pindex, &slot); if (couple == couple_orig) break; } WT_ERR(ret); /* * A new page: configure for traversal of any internal * page's children, else return the leaf page. */ descend: couple = ref; page = ref->page; if (WT_PAGE_IS_INTERNAL(page)) { WT_INTL_INDEX_GET(session, page, pindex); slot = prev ? pindex->entries - 1 : 0; empty_internal = true; } else { *refp = ref; goto done; } } } done: err: WT_LEAVE_PAGE_INDEX(session); return (ret); }
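/*
 * Illustrative sketch, not WiredTiger code: stripped of hazard pointers,
 * splits and the various skip modes, the walk above is an iterative
 * traversal of an n-ary tree using parent pointers and each child's slot
 * in its parent's index.  The simplified version below walks forward only
 * and returns only leaf pages; all names are invented.
 */
#include <stddef.h>
#include <stdint.h>

struct sketch_node {
    struct sketch_node *parent;     /* NULL at the root */
    struct sketch_node **child;     /* NULL for leaf pages */
    uint32_t entries;               /* number of children */
    uint32_t slot;                  /* slot in the parent's index */
};

/* Return the leaf after "ref", or NULL when the walk completes. */
static struct sketch_node *
sketch_next_leaf(struct sketch_node *root, struct sketch_node *ref)
{
    uint32_t slot;

    if (ref == NULL)                /* begin a new walk */
        ref = root;
    else {
ascend:     if (ref == root)
            return (NULL);          /* walk complete */
        slot = ref->slot;
        ref = ref->parent;
        if (slot == ref->entries - 1)
            goto ascend;            /* page exhausted, keep ascending */
        ref = ref->child[slot + 1]; /* move to the next subtree */
    }

    /* Descend to the left-most leaf of the current subtree. */
    while (ref->child != NULL)
        ref = ref->child[0];
    return (ref);
}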
/* * __wt_txn_checkpoint_log -- * Write a log record for a checkpoint operation. */ int __wt_txn_checkpoint_log( WT_SESSION_IMPL *session, bool full, uint32_t flags, WT_LSN *lsnp) { WT_DECL_ITEM(logrec); WT_DECL_RET; WT_ITEM *ckpt_snapshot, empty; WT_LSN *ckpt_lsn; WT_TXN *txn; uint8_t *end, *p; size_t recsize; uint32_t i, rectype = WT_LOGREC_CHECKPOINT; const char *fmt = WT_UNCHECKED_STRING(IIQIU); txn = &session->txn; ckpt_lsn = &txn->ckpt_lsn; /* * If this is a file sync, log it unless there is a full checkpoint in * progress. */ if (!full) { if (txn->full_ckpt) { if (lsnp != NULL) *lsnp = *ckpt_lsn; return (0); } return (__txn_log_file_sync(session, flags, lsnp)); } switch (flags) { case WT_TXN_LOG_CKPT_PREPARE: txn->full_ckpt = 1; WT_ERR(__wt_log_flush_lsn(session, ckpt_lsn, true)); /* * We need to make sure that the log records in the checkpoint * LSN are on disk. In particular, to make sure that the * current log file exists. */ WT_ERR(__wt_log_force_sync(session, ckpt_lsn)); break; case WT_TXN_LOG_CKPT_START: /* Take a copy of the transaction snapshot. */ txn->ckpt_nsnapshot = txn->snapshot_count; recsize = txn->ckpt_nsnapshot * WT_INTPACK64_MAXSIZE; WT_ERR(__wt_scr_alloc(session, recsize, &txn->ckpt_snapshot)); p = txn->ckpt_snapshot->mem; end = p + recsize; for (i = 0; i < txn->snapshot_count; i++) WT_ERR(__wt_vpack_uint( &p, WT_PTRDIFF(end, p), txn->snapshot[i])); break; case WT_TXN_LOG_CKPT_STOP: /* * During a clean connection close, we get here without the * prepare or start steps. In that case, log the current LSN * as the checkpoint LSN. */ if (!txn->full_ckpt) { txn->ckpt_nsnapshot = 0; WT_CLEAR(empty); ckpt_snapshot = &empty; WT_ERR(__wt_log_flush_lsn(session, ckpt_lsn, true)); } else ckpt_snapshot = txn->ckpt_snapshot; /* Write the checkpoint log record. */ WT_ERR(__wt_struct_size(session, &recsize, fmt, rectype, ckpt_lsn->file, ckpt_lsn->offset, txn->ckpt_nsnapshot, ckpt_snapshot)); WT_ERR(__wt_logrec_alloc(session, recsize, &logrec)); WT_ERR(__wt_struct_pack(session, (uint8_t *)logrec->data + logrec->size, recsize, fmt, rectype, ckpt_lsn->file, ckpt_lsn->offset, txn->ckpt_nsnapshot, ckpt_snapshot)); logrec->size += (uint32_t)recsize; WT_ERR(__wt_log_write(session, logrec, lsnp, F_ISSET(S2C(session), WT_CONN_CKPT_SYNC) ? WT_LOG_FSYNC : 0)); /* * If this full checkpoint completed successfully and there is * no hot backup in progress, tell the logging subsystem the * checkpoint LSN so that it can archive. Do not update the * logging checkpoint LSN if this is during a clean connection * close, only during a full checkpoint. A clean close may not * update any metadata LSN and we do not want to archive in * that case. */ if (!S2C(session)->hot_backup && txn->full_ckpt) WT_ERR(__wt_log_ckpt(session, ckpt_lsn)); /* FALLTHROUGH */ case WT_TXN_LOG_CKPT_CLEANUP: /* Clean up any allocated resources. */ WT_INIT_LSN(ckpt_lsn); txn->ckpt_nsnapshot = 0; __wt_scr_free(session, &txn->ckpt_snapshot); txn->full_ckpt = 0; break; WT_ILLEGAL_VALUE_ERR(session); } err: __wt_logrec_free(session, &logrec); return (ret); }
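/*
 * Illustrative sketch, not WiredTiger code: the CKPT_START case above
 * packs each snapshot transaction ID with __wt_vpack_uint, a
 * variable-length integer encoder.  A minimal LEB128-style encoder (not
 * WiredTiger's actual on-disk format) looks like this.
 */
#include <stddef.h>
#include <stdint.h>

static int
sketch_vpack_uint(uint8_t **pp, size_t maxlen, uint64_t x)
{
    uint8_t *p;
    size_t used;

    for (p = *pp, used = 0;; x >>= 7) {
        if (++used > maxlen)
            return (-1);        /* buffer too small */
        if (x >= 0x80)          /* more bytes follow */
            *p++ = (uint8_t)(x & 0x7f) | 0x80;
        else {                  /* final byte */
            *p++ = (uint8_t)x;
            break;
        }
    }
    *pp = p;                    /* advance the caller's cursor */
    return (0);
}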
/* * __wt_lsm_checkpoint_chunk -- * Flush a single LSM chunk to disk. */ int __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk) { WT_DECL_RET; WT_TXN_ISOLATION saved_isolation; bool flush_set; flush_set = false; /* * If the chunk is already checkpointed, make sure it is also evicted. * Either way, there is no point trying to checkpoint it again. */ if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) && !chunk->evicted) { WT_WITH_HANDLE_LIST_LOCK(session, ret = __lsm_discard_handle(session, chunk->uri, NULL)); if (ret == 0) chunk->evicted = 1; else if (ret == EBUSY) ret = 0; else WT_RET_MSG(session, ret, "discard handle"); } if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) { __wt_verbose(session, WT_VERB_LSM, "LSM worker %s already on disk", chunk->uri); return (0); } /* Stop if a running transaction needs the chunk. */ WT_RET(__wt_txn_update_oldest( session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT)); if (chunk->switch_txn == WT_TXN_NONE || !__wt_txn_visible_all(session, chunk->switch_txn)) { __wt_verbose(session, WT_VERB_LSM, "LSM worker %s: running transaction, return", chunk->uri); return (0); } if (!__wt_atomic_cas8(&chunk->flushing, 0, 1)) return (0); flush_set = true; __wt_verbose(session, WT_VERB_LSM, "LSM worker flushing %s", chunk->uri); /* * Flush the file before checkpointing: this is the expensive part in * terms of I/O. * * !!! * We can wait here for checkpoints and fsyncs to complete, which can * take a long time. */ if ((ret = __wt_session_get_btree( session, chunk->uri, NULL, NULL, 0)) == 0) { /* * Set read-uncommitted: we have already checked that all of the * updates in this chunk are globally visible, use the cheapest * possible check in reconciliation. */ saved_isolation = session->txn.isolation; session->txn.isolation = WT_ISO_READ_UNCOMMITTED; ret = __wt_cache_op(session, WT_SYNC_WRITE_LEAVES); session->txn.isolation = saved_isolation; WT_TRET(__wt_session_release_btree(session)); } WT_ERR(ret); __wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing %s", chunk->uri); /* * Turn on metadata tracking to ensure the checkpoint gets the * necessary handle locks. * * Ensure that we don't race with a running checkpoint: the checkpoint * lock protects against us racing with an application checkpoint in * this chunk. Don't wait for it, though: checkpoints can take a long * time, and our checkpoint operation should be very quick. */ WT_ERR(__wt_meta_track_on(session)); WT_WITH_CHECKPOINT_LOCK(session, ret, WT_WITH_SCHEMA_LOCK(session, ret, ret = __wt_schema_worker( session, chunk->uri, __wt_checkpoint, NULL, NULL, 0))); WT_TRET(__wt_meta_track_off(session, false, ret != 0)); if (ret != 0) WT_ERR_MSG(session, ret, "LSM checkpoint"); /* Now the file is written, get the chunk size. */ WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk)); /* Update the flush timestamp to help track ongoing progress. */ __wt_epoch(session, &lsm_tree->last_flush_ts); ++lsm_tree->chunks_flushed; /* Lock the tree, mark the chunk as on disk and update the metadata. */ __wt_lsm_tree_writelock(session, lsm_tree); F_SET(chunk, WT_LSM_CHUNK_ONDISK); ret = __wt_lsm_meta_write(session, lsm_tree); ++lsm_tree->dsk_gen; /* Update the throttle time. */ __wt_lsm_tree_throttle(session, lsm_tree, true); __wt_lsm_tree_writeunlock(session, lsm_tree); if (ret != 0) WT_ERR_MSG(session, ret, "LSM metadata write"); WT_PUBLISH(chunk->flushing, 0); flush_set = false; /* * Clear the no-eviction flag so the primary can be evicted and * eventually closed. 
Only do this once the checkpoint has succeeded: * otherwise, accessing the leaf page during the checkpoint can trigger * forced eviction. */ WT_ERR(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0)); __wt_btree_evictable(session, true); WT_ERR(__wt_session_release_btree(session)); /* Make sure we aren't pinning a transaction ID. */ __wt_txn_release_snapshot(session); __wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointed %s", chunk->uri); /* Schedule a bloom filter create for our newly flushed chunk. */ if (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF)) WT_ERR(__wt_lsm_manager_push_entry( session, WT_LSM_WORK_BLOOM, 0, lsm_tree)); else WT_ERR(__wt_lsm_manager_push_entry( session, WT_LSM_WORK_MERGE, 0, lsm_tree)); err: if (flush_set) WT_PUBLISH(chunk->flushing, 0); return (ret); }
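/*
 * Illustrative sketch, not WiredTiger code: the flush above is claimed
 * with an atomic compare-and-swap on chunk->flushing so that only one
 * worker flushes a chunk.  The same single-owner pattern in portable C11
 * atomics, with invented names, looks like this.
 */
#include <stdatomic.h>
#include <stdbool.h>

static bool
sketch_claim_flush(atomic_uchar *flushingp)
{
    unsigned char expected;

    expected = 0;
    /* Succeeds for exactly one caller; everyone else backs off. */
    return (atomic_compare_exchange_strong(flushingp, &expected, 1));
}

static void
sketch_release_flush(atomic_uchar *flushingp)
{
    /* A release store pairs with the acquire in the claiming CAS. */
    atomic_store_explicit(flushingp, 0, memory_order_release);
}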
/* * __las_page_instantiate -- * Instantiate lookaside update records in a recently read page. */ static int __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t read_id, const uint8_t *addr, size_t addr_size) { WT_CURSOR *cursor; WT_CURSOR_BTREE cbt; WT_DECL_ITEM(current_key); WT_DECL_ITEM(las_addr); WT_DECL_ITEM(las_key); WT_DECL_ITEM(las_value); WT_DECL_RET; WT_PAGE *page; WT_UPDATE *first_upd, *last_upd, *upd; size_t incr, total_incr; uint64_t current_recno, las_counter, las_txnid, recno, upd_txnid; uint32_t las_id, upd_size, session_flags; int exact; const uint8_t *p; cursor = NULL; page = ref->page; first_upd = last_upd = upd = NULL; total_incr = 0; current_recno = recno = WT_RECNO_OOB; session_flags = 0; /* [-Werror=maybe-uninitialized] */ __wt_btcur_init(session, &cbt); __wt_btcur_open(&cbt); WT_ERR(__wt_scr_alloc(session, 0, &current_key)); WT_ERR(__wt_scr_alloc(session, 0, &las_addr)); WT_ERR(__wt_scr_alloc(session, 0, &las_key)); WT_ERR(__wt_scr_alloc(session, 0, &las_value)); /* Open a lookaside table cursor. */ WT_ERR(__wt_las_cursor(session, &cursor, &session_flags)); /* * The lookaside records are in key and update order, that is, there * will be a set of in-order updates for a key, then another set of * in-order updates for a subsequent key. We process all of the updates * for a key and then insert those updates into the page, then all the * updates for the next key, and so on. * * Search for the block's unique prefix, stepping through any matching * records. */ las_addr->data = addr; las_addr->size = addr_size; las_key->size = 0; cursor->set_key( cursor, read_id, las_addr, (uint64_t)0, (uint32_t)0, las_key); if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0) ret = cursor->next(cursor); for (; ret == 0; ret = cursor->next(cursor)) { WT_ERR(cursor->get_key(cursor, &las_id, las_addr, &las_counter, &las_txnid, las_key)); /* * Confirm the search using the unique prefix; if not a match, * we're done searching for records for this page. */ if (las_id != read_id || las_addr->size != addr_size || memcmp(las_addr->data, addr, addr_size) != 0) break; /* * If the on-page value has become globally visible, this record * is no longer needed. */ if (__wt_txn_visible_all(session, las_txnid)) continue; /* Allocate the WT_UPDATE structure. */ WT_ERR(cursor->get_value( cursor, &upd_txnid, &upd_size, las_value)); WT_ERR(__wt_update_alloc(session, (upd_size == WT_UPDATE_DELETED_VALUE) ? NULL : las_value, &upd, &incr)); total_incr += incr; upd->txnid = upd_txnid; switch (page->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: p = las_key->data; WT_ERR(__wt_vunpack_uint(&p, 0, &recno)); if (current_recno == recno) break; WT_ASSERT(session, current_recno < recno); if (first_upd != NULL) { WT_ERR(__col_instantiate(session, current_recno, ref, &cbt, first_upd)); first_upd = NULL; } current_recno = recno; break; case WT_PAGE_ROW_LEAF: if (current_key->size == las_key->size && memcmp(current_key->data, las_key->data, las_key->size) == 0) break; if (first_upd != NULL) { WT_ERR(__row_instantiate(session, current_key, ref, &cbt, first_upd)); first_upd = NULL; } WT_ERR(__wt_buf_set(session, current_key, las_key->data, las_key->size)); break; WT_ILLEGAL_VALUE_ERR(session); } /* Append the latest update to the list. */ if (first_upd == NULL) first_upd = last_upd = upd; else { last_upd->next = upd; last_upd = upd; } upd = NULL; } WT_ERR_NOTFOUND_OK(ret); /* Insert the last set of updates, if any. 
*/ if (first_upd != NULL) switch (page->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: WT_ERR(__col_instantiate(session, current_recno, ref, &cbt, first_upd)); first_upd = NULL; break; case WT_PAGE_ROW_LEAF: WT_ERR(__row_instantiate(session, current_key, ref, &cbt, first_upd)); first_upd = NULL; break; WT_ILLEGAL_VALUE_ERR(session); } /* Discard the cursor. */ WT_ERR(__wt_las_cursor_close(session, &cursor, session_flags)); if (total_incr != 0) { __wt_cache_page_inmem_incr(session, page, total_incr); /* * We've modified/dirtied the page, but that's not necessary and * if we keep the page clean, it's easier to evict. We leave the * lookaside table updates in place, so if we evict this page * without dirtying it, any future instantiation of it will find * the records it needs. If the page is dirtied before eviction, * then we'll write any needed lookaside table records for the * new location of the page. */ __wt_page_modify_clear(session, page); } err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); WT_TRET(__wt_btcur_close(&cbt, true)); /* * On error, upd points to a single unlinked WT_UPDATE structure, * first_upd points to a list. */ if (upd != NULL) __wt_free(session, upd); if (first_upd != NULL) __wt_free_update_list(session, first_upd); __wt_scr_free(session, &current_key); __wt_scr_free(session, &las_addr); __wt_scr_free(session, &las_key); __wt_scr_free(session, &las_value); return (ret); }
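/*
 * Illustrative sketch, not WiredTiger code: the instantiate loop above
 * builds each key's update list in cursor order by appending through a
 * head/tail pair of pointers, avoiding a list walk per insert.  The
 * pattern in isolation, with an invented update type, looks like this.
 */
#include <stddef.h>

struct sketch_upd {
    struct sketch_upd *next;
    /* ... transaction ID, value, and so on ... */
};

static void
sketch_append_upd(struct sketch_upd **firstp,
    struct sketch_upd **lastp, struct sketch_upd *upd)
{
    upd->next = NULL;
    if (*firstp == NULL)            /* first update for the key */
        *firstp = *lastp = upd;
    else {                          /* append, preserving order */
        (*lastp)->next = upd;
        *lastp = upd;
    }
}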
/* * __wt_lsm_free_chunks -- * Try to drop chunks from the tree that are no longer required. */ int __wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { WT_DECL_RET; WT_LSM_CHUNK *chunk; WT_LSM_WORKER_COOKIE cookie; u_int i, skipped; int drop_ret; bool flush_metadata; flush_metadata = false; if (lsm_tree->nold_chunks == 0) return (0); /* * Make sure only a single thread is freeing the old chunk array * at any time. */ if (!__wt_atomic_cas32(&lsm_tree->freeing_old_chunks, 0, 1)) return (0); /* * Take a copy of the current state of the LSM tree and look for chunks * to drop. We do it this way to avoid holding the LSM tree lock while * doing I/O or waiting on the schema lock. * * This is safe because only one thread will be in this function at a * time. Merges may complete concurrently, and the old_chunks array * may be extended, but we shuffle down the pointers each time we free * one to keep the non-NULL slots at the beginning of the array. */ WT_CLEAR(cookie); WT_RET(__lsm_copy_chunks(session, lsm_tree, &cookie, true)); for (i = skipped = 0; i < cookie.nchunks; i++) { chunk = cookie.chunk_array[i]; WT_ASSERT(session, chunk != NULL); /* Skip the chunk if another worker is using it. */ if (chunk->refcnt > 1) { ++skipped; continue; } /* * Don't remove files if a hot backup is in progress. * * The schema lock protects the set of live files, this check * prevents us from removing a file that hot backup already * knows about. */ if (S2C(session)->hot_backup) break; /* * Drop any bloom filters and chunks we can. Don't try to drop * a chunk if the bloom filter drop fails. * An EBUSY return indicates that a cursor is still open in * the tree - move to the next chunk in that case. * An ENOENT return indicates that the LSM tree metadata was * out of sync with the on disk state. Update the * metadata to match in that case. */ if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) { drop_ret = __lsm_drop_file(session, chunk->bloom_uri); if (drop_ret == EBUSY) { ++skipped; continue; } else if (drop_ret != ENOENT) WT_ERR(drop_ret); flush_metadata = true; F_CLR(chunk, WT_LSM_CHUNK_BLOOM); } if (chunk->uri != NULL) { drop_ret = __lsm_drop_file(session, chunk->uri); if (drop_ret == EBUSY) { ++skipped; continue; } else if (drop_ret != ENOENT) WT_ERR(drop_ret); flush_metadata = true; } /* Lock the tree to clear out the old chunk information. */ __wt_lsm_tree_writelock(session, lsm_tree); /* * The chunk we are looking at should be the first one in the * tree that we haven't already skipped over. */ WT_ASSERT(session, lsm_tree->old_chunks[skipped] == chunk); __wt_free(session, chunk->bloom_uri); __wt_free(session, chunk->uri); __wt_free(session, lsm_tree->old_chunks[skipped]); /* Shuffle down to keep all occupied slots at the beginning. */ if (--lsm_tree->nold_chunks > skipped) { memmove(lsm_tree->old_chunks + skipped, lsm_tree->old_chunks + skipped + 1, (lsm_tree->nold_chunks - skipped) * sizeof(WT_LSM_CHUNK *)); lsm_tree->old_chunks[lsm_tree->nold_chunks] = NULL; } __wt_lsm_tree_writeunlock(session, lsm_tree); /* * Clear the chunk in the cookie so we don't attempt to * decrement the reference count. 
*/ cookie.chunk_array[i] = NULL; } err: /* Flush the metadata unless the system is panicking. */ if (flush_metadata && ret != WT_PANIC) { __wt_lsm_tree_writelock(session, lsm_tree); WT_TRET(__wt_lsm_meta_write(session, lsm_tree)); __wt_lsm_tree_writeunlock(session, lsm_tree); } __lsm_unpin_chunks(session, &cookie); __wt_free(session, cookie.chunk_array); lsm_tree->freeing_old_chunks = 0; /* Return WT_NOTFOUND to tell the caller there was no work to do. */ if (!flush_metadata) WT_TRET(WT_NOTFOUND); return (ret); }
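/*
 * Illustrative sketch, not WiredTiger code: freeing a chunk above
 * shuffles the remaining old_chunks pointers down with memmove so the
 * occupied slots stay at the front of the array.  The same compaction in
 * isolation, with invented names, looks like this.
 */
#include <stddef.h>
#include <string.h>

static void
sketch_array_remove(void **array, size_t *np, size_t slot)
{
    /* Close the hole at "slot", then NULL the newly freed tail slot. */
    if (--*np > slot)
        memmove(array + slot, array + slot + 1,
            (*np - slot) * sizeof(void *));
    array[*np] = NULL;
}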
/* * __wt_las_remove_block -- * Remove all records matching a key prefix from the lookaside store. */ int __wt_las_remove_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size) { WT_DECL_ITEM(las_addr); WT_DECL_ITEM(las_key); WT_DECL_RET; uint64_t las_counter, las_txnid; int64_t remove_cnt; uint32_t las_id; int exact; remove_cnt = 0; WT_ERR(__wt_scr_alloc(session, 0, &las_addr)); WT_ERR(__wt_scr_alloc(session, 0, &las_key)); /* * Search for the block's unique prefix and step through all matching * records, removing them. */ las_addr->data = addr; las_addr->size = addr_size; las_key->size = 0; cursor->set_key( cursor, btree_id, las_addr, (uint64_t)0, (uint32_t)0, las_key); if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0) ret = cursor->next(cursor); for (; ret == 0; ret = cursor->next(cursor)) { WT_ERR(cursor->get_key(cursor, &las_id, las_addr, &las_counter, &las_txnid, las_key)); /* * Confirm the search using the unique prefix; if not a match, * we're done searching for records for this page. */ if (las_id != btree_id || las_addr->size != addr_size || memcmp(las_addr->data, addr, addr_size) != 0) break; /* * Cursor opened overwrite=true: won't return WT_NOTFOUND should * another thread remove the record before we do, and the cursor * remains positioned in that case. */ WT_ERR(cursor->remove(cursor)); ++remove_cnt; } WT_ERR_NOTFOUND_OK(ret); err: __wt_scr_free(session, &las_addr); __wt_scr_free(session, &las_key); /* * If there were races to remove records, we can over-count. All * arithmetic is signed, so underflow isn't fatal, but check anyway so * we don't skew low over time. */ if (remove_cnt > S2C(session)->las_record_cnt) S2C(session)->las_record_cnt = 0; else if (remove_cnt > 0) (void)__wt_atomic_subi64( &S2C(session)->las_record_cnt, remove_cnt); return (ret); }
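/*
 * Illustrative sketch, not WiredTiger code: the accounting above clamps
 * the shared lookaside record count at zero rather than letting racing
 * removals drive it negative over time.  The same clamp with C11 atomics
 * and invented names looks like this; as in the original, the check and
 * the subtraction aren't atomic together, which is acceptable because
 * the count is only advisory.
 */
#include <stdatomic.h>
#include <stdint.h>

static void
sketch_record_cnt_sub(_Atomic int64_t *cntp, int64_t remove_cnt)
{
    if (remove_cnt <= 0)
        return;
    if (remove_cnt > atomic_load(cntp))
        atomic_store(cntp, 0);      /* over-counted: clamp at zero */
    else
        (void)atomic_fetch_sub(cntp, remove_cnt);
}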
/* * __wt_lsm_tree_create -- * Create an LSM tree structure for the given name. */ int __wt_lsm_tree_create(WT_SESSION_IMPL *session, const char *uri, int exclusive, const char *config) { WT_CONFIG_ITEM cval; WT_DECL_ITEM(buf); WT_DECL_RET; WT_LSM_TREE *lsm_tree; const char *cfg[] = { WT_CONFIG_BASE(session, session_create), config, NULL }; const char *tmpconfig; /* If the tree is open, it already exists. */ if ((ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree)) == 0) { __wt_lsm_tree_release(session, lsm_tree); return (exclusive ? EEXIST : 0); } WT_RET_NOTFOUND_OK(ret); /* * If the tree has metadata, it already exists. * * !!! * Use a local variable: we don't care what the existing configuration * is, but we don't want to overwrite the real config. */ if (__wt_metadata_search(session, uri, &tmpconfig) == 0) { __wt_free(session, tmpconfig); return (exclusive ? EEXIST : 0); } WT_RET_NOTFOUND_OK(ret); WT_RET(__wt_config_gets(session, cfg, "key_format", &cval)); if (WT_STRING_MATCH("r", cval.str, cval.len)) WT_RET_MSG(session, EINVAL, "LSM trees cannot be configured as column stores"); WT_RET(__wt_calloc_def(session, 1, &lsm_tree)); WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri)); WT_ERR(__wt_config_gets(session, cfg, "key_format", &cval)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &lsm_tree->key_format)); WT_ERR(__wt_config_gets(session, cfg, "value_format", &cval)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &lsm_tree->value_format)); WT_ERR(__wt_config_gets(session, cfg, "collator", &cval)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &lsm_tree->collator_name)); WT_ERR(__wt_config_gets(session, cfg, "lsm.auto_throttle", &cval)); if (cval.val) F_SET(lsm_tree, WT_LSM_TREE_THROTTLE); else F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE); WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom", &cval)); FLD_SET(lsm_tree->bloom, (cval.val == 0 ? WT_LSM_BLOOM_OFF : WT_LSM_BLOOM_MERGED)); WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_oldest", &cval)); if (cval.val != 0) FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST); if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) && FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST)) WT_ERR_MSG(session, EINVAL, "Bloom filters can only be created on newest and oldest " "chunks if bloom filters are enabled"); WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_config", &cval)); if (cval.type == WT_CONFIG_ITEM_STRUCT) { cval.str++; cval.len -= 2; } WT_ERR(__wt_strndup(session, cval.str, cval.len, &lsm_tree->bloom_config)); WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_bit_count", &cval)); lsm_tree->bloom_bit_count = (uint32_t)cval.val; WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_hash_count", &cval)); lsm_tree->bloom_hash_count = (uint32_t)cval.val; WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_max", &cval)); lsm_tree->chunk_max = (uint64_t)cval.val; WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_size", &cval)); lsm_tree->chunk_size = (uint64_t)cval.val; if (lsm_tree->chunk_size > lsm_tree->chunk_max) WT_ERR_MSG(session, EINVAL, "Chunk size (chunk_size) must be smaller than or equal to " "the maximum chunk size (chunk_max)"); WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_max", &cval)); lsm_tree->merge_max = (uint32_t)cval.val; lsm_tree->merge_min = lsm_tree->merge_max / 2; WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_threads", &cval)); lsm_tree->merge_threads = (uint32_t)cval.val; /* Sanity check that api_data.py is in sync with lsm.h */ WT_ASSERT(session, lsm_tree->merge_threads <= WT_LSM_MAX_WORKERS); /* * Set up the config for each chunk. 
If possible, avoid high latencies * from fsync by flushing the cache every 8MB (will be overridden by * any application setting). */ tmpconfig = ""; #ifdef HAVE_SYNC_FILE_RANGE if (!S2C(session)->direct_io) tmpconfig = "os_cache_dirty_max=8MB,"; #endif WT_ERR(__wt_scr_alloc(session, 0, &buf)); WT_ERR(__wt_buf_fmt(session, buf, "%s%s,key_format=u,value_format=u", tmpconfig, config)); lsm_tree->file_config = __wt_buf_steal(session, buf, NULL); /* Create the first chunk and flush the metadata. */ WT_ERR(__wt_lsm_meta_write(session, lsm_tree)); /* Discard our partially populated handle. */ ret = __lsm_tree_discard(session, lsm_tree); lsm_tree = NULL; /* * Open our new tree and add it to the handle cache. Don't discard on * error: the returned handle is NULL on error, and the metadata * tracking macros handle cleaning up on failure. */ if (ret == 0) ret = __lsm_tree_open(session, uri, &lsm_tree); if (ret == 0) __wt_lsm_tree_release(session, lsm_tree); if (0) { err: WT_TRET(__lsm_tree_discard(session, lsm_tree)); } __wt_scr_free(&buf); return (ret); }
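/*
 * Illustrative sketch, not WiredTiger code: bloom_bit_count is the number
 * of filter bits per inserted item and bloom_hash_count the number of
 * hash functions.  Standard Bloom filter analysis puts the expected false
 * positive rate at p = (1 - e^(-k/b))^k for k hashes and b bits per item;
 * the helper below (an invented name) evaluates that formula.
 */
#include <math.h>

static double
sketch_bloom_false_positive(double bits_per_item, double hash_count)
{
    /* For example, 16 bits and 8 hashes give p of roughly 0.06%. */
    return (pow(1.0 - exp(-hash_count / bits_per_item), hash_count));
}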
/* * __wt_txn_commit -- * Commit the current transaction. */ int __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) { WT_CONFIG_ITEM cval; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_OP *op; u_int i; bool locked, readonly; #ifdef HAVE_TIMESTAMPS wt_timestamp_t prev_commit_timestamp, ts; bool update_timestamp; #endif txn = &session->txn; conn = S2C(session); txn_global = &conn->txn_global; locked = false; WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING)); WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || txn->mod_count == 0); readonly = txn->mod_count == 0; /* * Look for a commit timestamp. */ WT_ERR( __wt_config_gets_def(session, cfg, "commit_timestamp", 0, &cval)); if (cval.len != 0) { #ifdef HAVE_TIMESTAMPS WT_ERR(__wt_txn_parse_timestamp(session, "commit", &ts, &cval)); WT_ERR(__wt_timestamp_validate(session, "commit", &ts, &cval, true, true, true)); __wt_timestamp_set(&txn->commit_timestamp, &ts); __wt_txn_set_commit_timestamp(session); #else WT_ERR_MSG(session, EINVAL, "commit_timestamp requires a " "version of WiredTiger built with timestamp support"); #endif } #ifdef HAVE_TIMESTAMPS /* * Debugging checks on timestamps, if user requested them. */ if (F_ISSET(txn, WT_TXN_TS_COMMIT_ALWAYS) && !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) && txn->mod_count != 0) WT_ERR_MSG(session, EINVAL, "commit_timestamp required and " "none set on this transaction"); if (F_ISSET(txn, WT_TXN_TS_COMMIT_NEVER) && F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) && txn->mod_count != 0) WT_ERR_MSG(session, EINVAL, "no commit_timestamp required and " "timestamp set on this transaction"); #endif /* * The default sync setting is inherited from the connection, but can * be overridden by an explicit "sync" setting for this transaction. */ WT_ERR(__wt_config_gets_def(session, cfg, "sync", 0, &cval)); /* * If the user chose the default setting, check whether sync is enabled * for this transaction (either inherited or via begin_transaction). * If sync is disabled, clear the field to avoid the log write being * flushed. * * Otherwise check for specific settings. We don't need to check for * "on" because that is the default inherited from the connection. If * the user set anything in begin_transaction, we only override with an * explicit setting. */ if (cval.len == 0) { if (!FLD_ISSET(txn->txn_logsync, WT_LOG_SYNC_ENABLED) && !F_ISSET(txn, WT_TXN_SYNC_SET)) txn->txn_logsync = 0; } else { /* * If the caller already set sync on begin_transaction then * they should not be using sync on commit_transaction. * Flag that as an error. */ if (F_ISSET(txn, WT_TXN_SYNC_SET)) WT_ERR_MSG(session, EINVAL, "Sync already set during begin_transaction"); if (WT_STRING_MATCH("background", cval.str, cval.len)) txn->txn_logsync = WT_LOG_BACKGROUND; else if (WT_STRING_MATCH("off", cval.str, cval.len)) txn->txn_logsync = 0; /* * We don't need to check for "on" here because that is the * default to inherit from the connection setting. */ } /* Commit notification. */ if (txn->notify != NULL) WT_ERR(txn->notify->notify(txn->notify, (WT_SESSION *)session, txn->id, 1)); /* * We are about to release the snapshot: copy values into any * positioned cursors so they don't point to updates that could be * freed once we don't have a snapshot. */ if (session->ncursors > 0) { WT_DIAGNOSTIC_YIELD; WT_ERR(__wt_session_copy_values(session)); } /* If we are logging, write a commit log record. 
*/ if (txn->logrec != NULL && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) && !F_ISSET(session, WT_SESSION_NO_LOGGING)) { /* * We are about to block on I/O writing the log. * Release our snapshot in case it is keeping data pinned. * This is particularly important for checkpoints. */ __wt_txn_release_snapshot(session); /* * We hold the visibility lock for reading from the time * we write our log record until the time we release our * transaction so that the LSN any checkpoint gets will * always reflect visible data. */ __wt_readlock(session, &txn_global->visibility_rwlock); locked = true; WT_ERR(__wt_txn_log_commit(session, cfg)); } /* Note: we're going to commit: nothing can fail after this point. */ /* Process and free updates. */ for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) { switch (op->type) { case WT_TXN_OP_BASIC: case WT_TXN_OP_BASIC_TS: case WT_TXN_OP_INMEM: /* * Switch reserved operations to abort to * simplify obsolete update list truncation. */ if (op->u.upd->type == WT_UPDATE_RESERVED) { op->u.upd->txnid = WT_TXN_ABORTED; break; } /* * Writes to the lookaside file can be evicted as soon * as they commit. */ if (conn->cache->las_fileid != 0 && op->fileid == conn->cache->las_fileid) { op->u.upd->txnid = WT_TXN_NONE; break; } #ifdef HAVE_TIMESTAMPS if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) && op->type != WT_TXN_OP_BASIC_TS) { WT_ASSERT(session, op->fileid != WT_METAFILE_ID); __wt_timestamp_set(&op->u.upd->timestamp, &txn->commit_timestamp); } #endif break; case WT_TXN_OP_REF: #ifdef HAVE_TIMESTAMPS if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) __wt_timestamp_set( &op->u.ref->page_del->timestamp, &txn->commit_timestamp); #endif break; case WT_TXN_OP_TRUNCATE_COL: case WT_TXN_OP_TRUNCATE_ROW: /* Other operations don't need timestamps. */ break; } __wt_txn_op_free(session, op); } txn->mod_count = 0; #ifdef HAVE_TIMESTAMPS /* * Track the largest commit timestamp we have seen. * * We don't actually clear the local commit timestamp, just the flag. * That said, we can't update the global commit timestamp until this * transaction is visible, which happens when we release it. */ update_timestamp = F_ISSET(txn, WT_TXN_HAS_TS_COMMIT); #endif __wt_txn_release(session); if (locked) __wt_readunlock(session, &txn_global->visibility_rwlock); #ifdef HAVE_TIMESTAMPS /* First check if we've already committed something in the future. */ if (update_timestamp) { WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, __wt_timestamp_set( &prev_commit_timestamp, &txn_global->commit_timestamp)); update_timestamp = __wt_timestamp_cmp( &txn->commit_timestamp, &prev_commit_timestamp) > 0; } /* * If it looks like we need to move the global commit timestamp, * write lock and re-check. */ if (update_timestamp) { #if WT_TIMESTAMP_SIZE == 8 while (__wt_timestamp_cmp( &txn->commit_timestamp, &prev_commit_timestamp) > 0) { if (__wt_atomic_cas64( &txn_global->commit_timestamp.val, prev_commit_timestamp.val, txn->commit_timestamp.val)) { txn_global->has_commit_timestamp = true; break; } __wt_timestamp_set( &prev_commit_timestamp, &txn_global->commit_timestamp); } #else __wt_writelock(session, &txn_global->rwlock); if (__wt_timestamp_cmp(&txn->commit_timestamp, &txn_global->commit_timestamp) > 0) { __wt_timestamp_set(&txn_global->commit_timestamp, &txn->commit_timestamp); txn_global->has_commit_timestamp = true; } __wt_writeunlock(session, &txn_global->rwlock); #endif } #endif /* * We're between transactions, if we need to block for eviction, it's * a good time to do so. 
Note that we must ignore any error return * because the user's data is committed. */ if (!readonly) (void)__wt_cache_eviction_check(session, false, false, NULL); return (0); err: /* * If anything went wrong, roll back. * * !!! * Nothing can fail after this point. */ if (locked) __wt_readunlock(session, &txn_global->visibility_rwlock); WT_TRET(__wt_txn_rollback(session, cfg)); return (ret); }
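/*
 * Illustrative sketch, not WiredTiger code: the 8-byte-timestamp branch
 * above advances the global commit timestamp with a compare-and-swap loop
 * that only ever moves the value forward.  The same monotonic maximize in
 * portable C11 atomics, with invented names, looks like this.
 */
#include <stdatomic.h>
#include <stdint.h>

static void
sketch_advance_timestamp(_Atomic uint64_t *globalp, uint64_t mine)
{
    uint64_t prev;

    prev = atomic_load(globalp);
    while (mine > prev)
        /* On failure, the CAS reloads prev and the loop re-checks. */
        if (atomic_compare_exchange_weak(globalp, &prev, mine))
            break;
}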