/*
 * __curfile_close --
 *	WT_CURSOR->close method for the btree cursor type.
 *
 *	Closes the btree cursor, releases the session's btree handle when one
 *	is set, then closes the generic cursor.  The result of each step is
 *	folded into ret with WT_TRET instead of being returned immediately,
 *	and ret is what the caller gets back.
 */
static int
__curfile_close(WT_CURSOR *cursor)
{
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cbt = (WT_CURSOR_BTREE *)cursor;
	/* Enter the API; this is also what assigns "session". */
	CURSOR_API_CALL(cursor, session, close, cbt->btree);
	WT_TRET(__wt_btcur_close(cbt));
	/* Only release a handle if the session currently has one. */
	if (session->btree != NULL)
		WT_TRET(__wt_session_release_btree(session));
	/*
	 * The URI is owned by the btree handle.  Clear the cursor's pointer
	 * before the generic close runs (presumably so it is not freed there
	 * -- confirm against __wt_cursor_close).
	 */
	cursor->uri = NULL;
	WT_TRET(__wt_cursor_close(cursor));
	API_END(session);

	return (ret);
}
/*
 * __backup_cleanup_handles --
 *	Give back every btree handle the backup cursor still holds, free the
 *	per-entry file names and finally free the list.  Kept apart from
 *	__backup_stop because it can be called without the schema lock held.
 *
 *	Returns 0 when there is no list; otherwise returns whatever error the
 *	handle releases accumulated (the whole list is still walked and freed).
 */
static int
__backup_cleanup_handles(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb)
{
	WT_CURSOR_BACKUP_ENTRY *entry;
	WT_DECL_RET;

	/* No list was ever built: nothing to do. */
	if (cb->list == NULL)
		return (0);

	/* The list ends at the first entry without a name. */
	entry = cb->list;
	while (entry->name != NULL) {
		/* Entries may or may not carry a handle. */
		if (entry->handle != NULL)
			WT_WITH_DHANDLE(session, entry->handle,
			    WT_TRET(__wt_session_release_btree(session)));
		__wt_free(session, entry->name);
		++entry;
	}

	/* Every entry is cleaned up, drop the array itself. */
	__wt_free(session, cb->list);
	return (ret);
}
/*
 * __meta_btree_apply --
 *	Walk the "file:" entries of the metadata through the supplied cursor
 *	and call func on each one, skipping the metadata file itself.
 *
 *	The walk stops at the first key without the "file:" prefix.  Any error
 *	other than running off the end of the cursor is returned to the caller.
 */
static inline int
__meta_btree_apply(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
    int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[])
{
	WT_DECL_RET;
	const char *name;
	int exact;

	/*
	 * Position on the first "file:" key: if the nearest match sorts
	 * before the search key, step forward once.
	 */
	cursor->set_key(cursor, "file:");
	ret = cursor->search_near(cursor, &exact);
	if (ret == 0 && exact < 0)
		ret = cursor->next(cursor);

	for (; ret == 0; ret = cursor->next(cursor)) {
		WT_RET(cursor->get_key(cursor, &name));

		/* Past the last file entry: done. */
		if (!WT_PREFIX_MATCH(name, "file:"))
			break;

		/* Everything except the metadata file gets the callback. */
		if (strcmp(name, WT_METAFILE_URI) != 0) {
			/*
			 * We need to pull the handle into the session handle
			 * cache and make sure it's referenced to stop other
			 * internal code dropping the handle (e.g in LSM when
			 * cleaning up obsolete chunks).  Holding the metadata
			 * lock isn't enough.
			 */
			ret = __wt_session_get_btree(
			    session, name, NULL, NULL, 0);
			if (ret == EBUSY)
				/* Busy handle: use the single-handle path. */
				ret = __wt_conn_btree_apply_single(
				    session, name, NULL, func, cfg);
			else if (ret == 0) {
				WT_SAVE_DHANDLE(session,
				    ret = func(session, cfg));
				/*
				 * Without metadata tracking the handle is
				 * released here; with tracking the handle lock
				 * is handed to the tracking code instead.
				 */
				if (!WT_META_TRACKING(session))
					WT_TRET(
					    __wt_session_release_btree(session));
				else
					WT_TRET(__wt_meta_track_handle_lock(
					    session, false));
			}
			WT_RET(ret);
		}
	}
	WT_RET_NOTFOUND_OK(ret);

	return (0);
}
/*
 * __wt_curfile_open --
 *	WT_SESSION->open_cursor method for the btree cursor type.
 *
 *	Decodes the "bulk" configuration (a 0/1 number or the string "bitmap"),
 *	gets the btree handle for a "file:" URI and creates the cursor on it.
 *	If cursor creation fails the handle is released before returning.
 */
int
__wt_curfile_open(WT_SESSION_IMPL *session, const char *uri,
    WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
{
	WT_CONFIG_ITEM cval;
	WT_DECL_RET;
	int bitmap, bulk, is_boolean;
	uint32_t flags;

	flags = 0;

	WT_RET(__wt_config_gets_defno(session, cfg, "bulk", &cval));

	/* A numeric 0 or 1 is treated as a plain on/off switch. */
	is_boolean =
	    cval.type == ITEM_NUM && (cval.val == 0 || cval.val == 1);
	if (is_boolean) {
		bulk = (cval.val != 0);
		bitmap = 0;
	} else if (WT_STRING_MATCH("bitmap", cval.str, cval.len)) {
		/* A bitmap load is a bulk load as well. */
		bitmap = 1;
		bulk = 1;
	} else
		WT_RET_MSG(session, EINVAL,
		    "Value for 'bulk' must be a boolean or 'bitmap'");

	/* Bulk handles require exclusive access. */
	if (bulk)
		LF_SET(WT_BTREE_BULK | WT_BTREE_EXCLUSIVE);

	/* TODO: handle projections. */

	/*
	 * Only "file:" objects are supported.  Get the handle and keep it
	 * locked while the cursor is using it.
	 */
	if (!WT_PREFIX_MATCH(uri, "file:"))
		WT_RET(__wt_bad_object_type(session, uri));
	else
		WT_RET(__wt_session_get_btree_ckpt(session, uri, cfg, flags));

	WT_ERR(__wt_curfile_create(session, owner, cfg, bulk, bitmap, cursorp));

	return (0);

err:	/* The cursor was not created: give the handle back. */
	WT_TRET(__wt_session_release_btree(session));
	return (ret);
}
/*
 * __meta_btree_apply --
 *	Walk every metadata entry after the cursor's current position, apart
 *	from the metadata file itself.  name_func (if any) sees every URI and
 *	may ask for it to be skipped; file_func (if any) is then called with
 *	the handle held for each remaining "file:" URI.
 *
 *	Note that a handle that cannot be acquired because it is busy ends the
 *	whole walk with a successful return; any other acquisition error is
 *	returned as is.
 */
static inline int
__meta_btree_apply(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
    int (*file_func)(WT_SESSION_IMPL *, const char *[]),
    int (*name_func)(WT_SESSION_IMPL *, const char *, bool *),
    const char *cfg[])
{
	WT_DECL_RET;
	const char *name;
	bool skip;

	for (ret = cursor->next(cursor);
	    ret == 0; ret = cursor->next(cursor)) {
		WT_RET(cursor->get_key(cursor, &name));

		/* The metadata file is never handed to the callbacks. */
		if (strcmp(name, WT_METAFILE_URI) == 0)
			continue;

		/* Let the name callback look at the URI first. */
		skip = false;
		if (name_func != NULL)
			WT_RET(name_func(session, name, &skip));

		/* Only un-skipped "file:" objects go to the file callback. */
		if (file_func != NULL &&
		    !skip && WT_PREFIX_MATCH(name, "file:")) {
			/*
			 * We need to pull the handle into the session handle
			 * cache and make sure it's referenced to stop other
			 * internal code dropping the handle (e.g in LSM when
			 * cleaning up obsolete chunks).  Holding the metadata
			 * lock isn't enough.
			 */
			ret = __wt_session_get_btree(
			    session, name, NULL, NULL, 0);
			if (ret == EBUSY)
				return (0);
			if (ret != 0)
				return (ret);

			WT_SAVE_DHANDLE(session, ret = file_func(session, cfg));
			WT_TRET(__wt_session_release_btree(session));
			WT_RET(ret);
		}
	}
	WT_RET_NOTFOUND_OK(ret);

	return (0);
}
/*
 * __meta_track_apply --
 *	Apply (roll forward) one metadata tracking record, then clear it.
 *
 *	Checkpoint records resolve the checkpoint in the block manager, drop
 *	records remove the file if it exists and lock records release the
 *	handle.  The remaining record types need no work here.
 */
static int
__meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_RET;
	int remove_ret;

	switch (trk->op) {
	case WT_ST_EMPTY:	/* Unused slot */
	case WT_ST_FILEOP:
	case WT_ST_REMOVE:
	case WT_ST_SET:
		/* Nothing to do for these. */
		break;
	case WT_ST_LOCK:
		WT_WITH_DHANDLE(session, trk->dhandle,
		    WT_TRET(__wt_session_release_btree(session)));
		break;
	case WT_ST_DROP_COMMIT:
		remove_ret = __wt_remove_if_exists(session, trk->a);
		if (remove_ret != 0) {
			/* Report the failure, then remember it. */
			__wt_err(session, remove_ret,
			    "metadata remove dropped file %s", trk->a);
			WT_TRET(remove_ret);
		}
		break;
	case WT_ST_CHECKPOINT:	/* Checkpoint, see above */
		btree = trk->dhandle->handle;
		bm = btree->bm;
		WT_WITH_DHANDLE(session, trk->dhandle,
		    WT_TRET(bm->checkpoint_resolve(bm, session)));
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* The record has been dealt with either way. */
	__meta_track_clear(session, trk);
	return (ret);
}
/*
 * __curbulk_close --
 *	WT_CURSOR->close for the bulk cursor type.
 *
 *	Ends the bulk load, releases the session's btree handle when one is
 *	set, then closes the generic cursor.  The result of each step is
 *	folded into ret, which is returned to the caller.
 */
static int
__curbulk_close(WT_CURSOR *cursor)
{
	WT_BTREE *btree;
	WT_CURSOR_BULK *cbulk;
	WT_SESSION_IMPL *session;
	/*
	 * ret is accumulated into and returned below, so it must start out as
	 * "no error": it was previously declared without an initial value.
	 */
	int ret = 0;

	cbulk = (WT_CURSOR_BULK *)cursor;
	btree = cbulk->cbt.btree;

	/* Enter the API; this is also what assigns "session". */
	CURSOR_API_CALL(cursor, session, close, btree);
	WT_TRET(__wt_bulk_end(cbulk));
	/* Only release a handle if the session currently has one. */
	if (session->btree != NULL)
		WT_TRET(__wt_session_release_btree(session));
	/* The URI is owned by the btree handle. */
	cursor->uri = NULL;
	WT_TRET(__wt_cursor_close(cursor));
	API_END(session);

	return (ret);
}
/*
 * __compact_handle_append --
 *	Gather a file handle to be compacted.
 *	Called via the schema_worker function.
 *
 *	On success the handle is held, marked as compacting and stored in the
 *	session's op_handle array.  On failure nothing is left held.
 */
static int
__compact_handle_append(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_DECL_RET;

	WT_UNUSED(cfg);

	/*
	 * Make sure there is space for the next entry before acquiring
	 * anything.  If the allocation were done after the handle had been
	 * acquired and compaction started, a failed allocation would return
	 * with the handle still held and not recorded in op_handle, so nothing
	 * would ever release it.
	 */
	WT_RET(__wt_realloc_def(session, &session->op_handle_allocated,
	    session->op_handle_next + 1, &session->op_handle));

	WT_RET(__wt_session_get_btree(
	    session, session->dhandle->name, NULL, NULL, 0));

	/* Set compact active on the handle. */
	if ((ret = __compact_start(session)) != 0) {
		/* Could not start: give the handle straight back. */
		WT_TRET(__wt_session_release_btree(session));
		return (ret);
	}

	/* The slot was reserved above, so this cannot fail. */
	session->op_handle[session->op_handle_next++] = session->dhandle;
	return (0);
}
/*
 * __wt_curfile_open --
 *	WT_SESSION->open_cursor method for the btree cursor type.
 *
 *	Translates the "bulk" and "no_cache" settings into handle flags, gets
 *	the btree handle for a "file:" URI and creates the cursor on it.  If
 *	cursor creation fails the handle is released (best effort) and the
 *	creation error is returned.
 */
int
__wt_curfile_open(WT_SESSION_IMPL *session, const char *uri,
    WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
{
	WT_CONFIG_ITEM cval;
	WT_DECL_RET;
	uint32_t flags;

	flags = 0;

	/*
	 * Bulk and no cache handles are exclusive and may not be used by more
	 * than a single thread.  Each setting also adds its own flag to the
	 * handle request.
	 */
	WT_RET(__wt_config_gets_defno(session, cfg, "bulk", &cval));
	if (cval.val != 0)
		LF_SET(WT_BTREE_BULK | WT_BTREE_EXCLUSIVE);

	WT_RET(__wt_config_gets_defno(session, cfg, "no_cache", &cval));
	if (cval.val != 0)
		LF_SET(WT_BTREE_NO_CACHE | WT_BTREE_EXCLUSIVE);

	/* TODO: handle projections. */

	/*
	 * Only "file:" objects are supported.  Get the handle and keep it
	 * locked while the cursor is using it.
	 */
	if (!WT_PREFIX_MATCH(uri, "file:"))
		WT_RET(__wt_bad_object_type(session, uri));
	else
		WT_RET(__wt_session_get_btree_ckpt(session, uri, cfg, flags));

	WT_ERR(__wt_curfile_create(session, owner, cfg, cursorp));

	return (0);

err:	/*
	 * The cursor was not created: give the handle back.  The result of
	 * the release is deliberately ignored, the creation error is what
	 * gets reported.
	 */
	(void)__wt_session_release_btree(session);
	return (ret);
}
/*
 * __wt_lsm_checkpoint_worker --
 *	A worker thread for an LSM tree, responsible for flushing new chunks to
 *	disk.
 *
 *	arg is the WT_LSM_TREE to service; the thread uses the tree's
 *	ckpt_session.  The thread loops for as long as WT_LSM_TREE_WORKING is
 *	set on the tree and always returns NULL; a failure other than
 *	WT_NOTFOUND is reported through WT_PANIC_ERR before returning.
 */
void *
__wt_lsm_checkpoint_worker(void *arg)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	WT_LSM_WORKER_COOKIE cookie;
	WT_SESSION_IMPL *session;
	WT_TXN_ISOLATION saved_isolation;
	u_int i, j;
	int locked;

	lsm_tree = arg;
	session = lsm_tree->ckpt_session;

	WT_CLEAR(cookie);

	while (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) {
		/* A pending switch is done first, under the schema lock. */
		if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
			WT_WITH_SCHEMA_LOCK(session, ret =
			    __wt_lsm_tree_switch(session, lsm_tree));
			WT_ERR(ret);
		}

		WT_ERR(__lsm_copy_chunks(session, lsm_tree, &cookie, 0));

		/*
		 * Write checkpoints in all completed files.  The last chunk
		 * in the copied array is never visited; j counts the chunks
		 * checkpointed in this pass.
		 *
		 * NOTE(review): i is unsigned; if cookie.nchunks is unsigned
		 * too and can be zero here, "nchunks - 1" wraps around --
		 * confirm that __lsm_copy_chunks always yields at least one
		 * chunk.
		 */
		for (i = 0, j = 0; i < cookie.nchunks - 1; i++) {
			if (!F_ISSET(lsm_tree, WT_LSM_TREE_WORKING))
				goto err;

			if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
				break;

			chunk = cookie.chunk_array[i];

			/* Stop if a running transaction needs the chunk. */
			__wt_txn_update_oldest(session);
			if (!__wt_txn_visible_all(session, chunk->txnid_max))
				break;

			/*
			 * If the chunk is already checkpointed, make sure it
			 * is also evicted.  Either way, there is no point
			 * trying to checkpoint it again.
			 */
			if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK)) {
				if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_EVICTED))
					continue;

				/* A busy handle is retried on a later pass. */
				if ((ret = __lsm_discard_handle(
				    session, chunk->uri, NULL)) == 0)
					F_SET_ATOMIC(
					    chunk, WT_LSM_CHUNK_EVICTED);
				else if (ret == EBUSY)
					ret = 0;
				else
					WT_ERR_MSG(session, ret,
					    "discard handle");
				continue;
			}

			WT_VERBOSE_ERR(session, lsm,
			    "LSM worker flushing %u", i);

			/*
			 * Flush the file before checkpointing: this is the
			 * expensive part in terms of I/O: do it without
			 * holding the schema lock.
			 *
			 * Use the special eviction isolation level to avoid
			 * interfering with an application checkpoint: we have
			 * already checked that all of the updates in this
			 * chunk are globally visible.
			 *
			 * !!! We can wait here for checkpoints and fsyncs to
			 * complete, which can be a long time.
			 *
			 * Don't keep waiting for the lock if application
			 * threads are waiting for a switch.  Don't skip
			 * flushing the leaves either: that just means we'll
			 * hold the schema lock for (much) longer, which blocks
			 * the world.
			 */
			WT_ERR(__wt_session_get_btree(
			    session, chunk->uri, NULL, NULL, 0));
			/*
			 * Spin on the checkpoint lock: stop once it is held,
			 * on an error other than EBUSY, or when a switch is
			 * requested.
			 */
			for (locked = 0;
			    !locked && ret == 0 &&
			    !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);) {
				if ((ret = __wt_spin_trylock(session,
				    &S2C(session)->checkpoint_lock)) == 0)
					locked = 1;
				else if (ret == EBUSY) {
					__wt_yield();
					ret = 0;
				}
			}
			if (locked) {
				/* Isolation is restored right after the flush. */
				saved_isolation = session->txn.isolation;
				session->txn.isolation = TXN_ISO_EVICTION;
				ret = __wt_bt_cache_op(
				    session, NULL, WT_SYNC_WRITE_LEAVES);
				session->txn.isolation = saved_isolation;
				__wt_spin_unlock(
				    session, &S2C(session)->checkpoint_lock);
			}
			/* The handle is released whether or not we flushed. */
			WT_TRET(__wt_session_release_btree(session));
			WT_ERR(ret);

			if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
				break;

			WT_VERBOSE_ERR(session, lsm,
			    "LSM worker checkpointing %u", i);

			WT_WITH_SCHEMA_LOCK(session,
			    ret = __wt_schema_worker(session, chunk->uri,
			    __wt_checkpoint, NULL, NULL, 0));

			/* A failed checkpoint ends this pass, not the thread. */
			if (ret != 0) {
				__wt_err(session, ret, "LSM checkpoint");
				break;
			}

			WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));

			/*
			 * Clear the "cache resident" flag so the primary can
			 * be evicted and eventually closed.  Only do this once
			 * the checkpoint has succeeded: otherwise, accessing
			 * the leaf page during the checkpoint can trigger
			 * forced eviction.
			 */
			WT_ERR(__wt_session_get_btree(
			    session, chunk->uri, NULL, NULL, 0));
			__wt_btree_evictable(session, 1);
			WT_ERR(__wt_session_release_btree(session));

			++j;
			/*
			 * Under the tree lock: mark the chunk on disk, write
			 * the LSM metadata and bump the disk generation.
			 */
			WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));
			F_SET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK);
			ret = __wt_lsm_meta_write(session, lsm_tree);
			++lsm_tree->dsk_gen;

			/* Update the throttle time. */
			__wt_lsm_tree_throttle(session, lsm_tree);
			WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));

			/* Make sure we aren't pinning a transaction ID. */
			__wt_txn_release_snapshot(session);

			if (ret != 0) {
				__wt_err(session, ret,
				    "LSM checkpoint metadata write");
				break;
			}

			WT_VERBOSE_ERR(session, lsm,
			    "LSM worker checkpointed %u", i);
		}
		__lsm_unpin_chunks(session, &cookie);
		/*
		 * Nothing was checkpointed and nobody wants a switch: wait on
		 * the tree's work condition (timeouts are not errors).
		 */
		if (j == 0 && F_ISSET(lsm_tree, WT_LSM_TREE_WORKING) &&
		    !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
			WT_ERR_TIMEDOUT_OK(__wt_cond_wait(
			    session, lsm_tree->work_cond, 100000));
	}
err:	__lsm_unpin_chunks(session, &cookie);
	__wt_free(session, cookie.chunk_array);
	/*
	 * The thread will only exit with failure if we run out of memory or
	 * there is some other system driven failure.  We can't keep going
	 * after such a failure - ensure WiredTiger shuts down.
	 */
	if (ret != 0 && ret != WT_NOTFOUND)
		WT_PANIC_ERR(session, ret,
		    "Shutting down LSM checkpoint utility thread");

	return (NULL);
}
/*
 * __create_file --
 *	Create a new 'file:' object.
 *
 *	uri must start with "file:" (EINVAL otherwise).  If the object already
 *	has a metadata entry the function returns without creating anything:
 *	EEXIST when exclusive is set, otherwise the result of the metadata
 *	search.  config is the caller's configuration string, layered on top
 *	of the base file metadata configuration.
 */
static int
__create_file(WT_SESSION_IMPL *session,
    const char *uri, int exclusive, const char *config)
{
	WT_DECL_ITEM(val);
	WT_DECL_RET;
	uint32_t allocsize;
	int is_metadata;
	/*
	 * The configuration stack has two spare NULL slots: one is filled in
	 * below with the generated id/version string, the other keeps the
	 * array NULL-terminated.
	 */
	const char *filename, **p, *filecfg[] =
	    { WT_CONFIG_BASE(session, file_meta), config, NULL, NULL };
	char *fileconf;

	fileconf = NULL;

	is_metadata = strcmp(uri, WT_METAFILE_URI) == 0;

	/* filename is the URI with the "file:" prefix stripped. */
	filename = uri;
	if (!WT_PREFIX_SKIP(filename, "file:"))
		WT_RET_MSG(session, EINVAL, "Expected a 'file:' URI: %s", uri);

	/*
	 * Check if the file already exists.  The metadata file itself is
	 * never looked up.  Anything other than "not found" ends the call
	 * through the common cleanup path.
	 */
	if (!is_metadata && (ret =
	    __wt_metadata_search(session, uri, &fileconf)) != WT_NOTFOUND) {
		if (exclusive)
			WT_TRET(EEXIST);
		goto err;
	}

	/* Sanity check the allocation size. */
	WT_RET(__wt_direct_io_size_check(
	    session, filecfg, "allocation_size", &allocsize));

	/* Create the file. */
	WT_ERR(__wt_block_manager_create(session, filename, allocsize));
	/* When tracking metadata changes, record the file creation. */
	if (WT_META_TRACKING(session))
		WT_ERR(__wt_meta_track_fileop(session, NULL, uri));

	/*
	 * If creating an ordinary file, append the file ID and current version
	 * numbers to the passed-in configuration and insert the resulting
	 * configuration into the metadata.
	 */
	if (!is_metadata) {
		WT_ERR(__wt_scr_alloc(session, 0, &val));
		WT_ERR(__wt_buf_fmt(session, val,
		    "id=%" PRIu32 ",version=(major=%d,minor=%d)",
		    ++S2C(session)->next_file_id,
		    WT_BTREE_MAJOR_VERSION_MAX, WT_BTREE_MINOR_VERSION_MAX));
		/* Find the first free slot in the configuration stack. */
		for (p = filecfg; *p != NULL; ++p)
			;
		*p = val->data;
		WT_ERR(__wt_config_collapse(session, filecfg, &fileconf));
		WT_ERR(__wt_metadata_insert(session, uri, fileconf));
	}

	/*
	 * Open the file to check that it was setup correctly.  We don't need to
	 * pass the configuration, we just wrote the collapsed configuration
	 * into the metadata file, and it's going to be read/used by underlying
	 * functions.
	 *
	 * Keep the handle exclusive until it is released at the end of the
	 * call, otherwise we could race with a drop.
	 */
	WT_ERR(__wt_session_get_btree(
	    session, uri, NULL, NULL, WT_DHANDLE_EXCLUSIVE));
	/*
	 * With metadata tracking on, the handle lock is handed to the tracking
	 * code; otherwise the handle is released right away.
	 */
	if (WT_META_TRACKING(session))
		WT_ERR(__wt_meta_track_handle_lock(session, 1));
	else
		WT_ERR(__wt_session_release_btree(session));

	/* Common exit: reached on success as well as on error. */
err:	__wt_scr_free(session, &val);
	__wt_free(session, fileconf);
	return (ret);
}
/*
 * __find_next_col --
 *	Search the table's column groups for a column named colname.
 *
 *	*cgnump and *colnump are both inputs and outputs.  Matching is enabled
 *	from the start of the scan and re-enabled whenever the scan reaches
 *	the incoming (*cgnump, *colnump) position; each match disables it
 *	again, and the last match recorded is the one reported.  On success
 *	*cgnump is the column group of the match, *coltype is WT_PROJ_KEY or
 *	WT_PROJ_VALUE and *colnump is the column's index within the key or
 *	value columns respectively.
 *
 *	Returns WT_NOTFOUND if no column matched, or the first error hit while
 *	scanning.
 */
static int
__find_next_col(WT_SESSION_IMPL *session, WT_TABLE *table,
    WT_CONFIG_ITEM *colname, int *cgnump, int *colnump, char *coltype)
{
	WT_BTREE *cgtree;
	WT_CONFIG conf;
	WT_CONFIG_ITEM cval, k, v;
	WT_DECL_RET;
	int cg, col, foundcg, foundcol, getnext;

	foundcg = foundcol = -1;

	getnext = 1;
	/*
	 * cgtree is non-NULL only while a column group's handle is held, so
	 * the error path knows whether there is a handle to release.
	 */
	for (cgtree = NULL, cg = 0; cg < WT_COLGROUPS(table); cg++) {
		WT_RET(__wt_schema_get_btree(session,
		    table->cg_name[cg], strlen(table->cg_name[cg]), NULL, 0));
		cgtree = session->btree;

		/*
		 * If there is only one column group, we just scan through all
		 * of the columns.  For tables with multiple column groups, we
		 * look at the key columns once, then go through the value
		 * columns for each group.
		 */
		if (cg == 0) {
			cval = table->colconf;
			col = 0;
		} else {
			/*
			 * Also entered by the goto below: restart the scan on
			 * this column group's own column list, numbering from
			 * the first value column.
			 */
cgcols:			WT_ERR(__wt_config_getones(session,
			    cgtree->config, "columns", &cval));
			col = table->nkey_columns;
		}
		WT_ERR(__wt_config_subinit(session, &conf, &cval));
		for (; __wt_config_next(&conf, &k, &v) == 0; col++) {
			/* Reached the caller's position: start matching. */
			if (cg == *cgnump && col == *colnump)
				getnext = 1;
			if (getnext && k.len == colname->len &&
			    strncmp(colname->str, k.str, k.len) == 0) {
				foundcg = cg;
				foundcol = col;
				getnext = 0;
			}
			/*
			 * In the first group of a table with explicit column
			 * groups, once the last key column has been seen jump
			 * back to scan the group's own columns.
			 */
			if (cg == 0 && table->ncolgroups > 0 &&
			    col == table->nkey_columns - 1)
				goto cgcols;
		}
		/* Clear first so the error path can't release it twice. */
		cgtree = NULL;
		WT_ERR(__wt_session_release_btree(session));
	}

	/* Reached on success too: cgtree is NULL in that case. */
err:	if (cgtree != NULL)
		WT_TRET(__wt_session_release_btree(session));
	WT_RET(ret);

	if (foundcg == -1)
		return (WT_NOTFOUND);

	*cgnump = foundcg;
	/* Columns numbered below nkey_columns are key columns. */
	if (foundcol < table->nkey_columns) {
		*coltype = WT_PROJ_KEY;
		*colnump = foundcol;
	} else {
		*coltype = WT_PROJ_VALUE;
		*colnump = foundcol - table->nkey_columns;
	}
	return (0);
}
/*
 * __meta_track_apply --
 *	Process one metadata tracking record and reset it.
 *
 *	When unroll is zero only checkpoint and handle-lock records do any
 *	work: checkpoints are resolved and handle locks released.  When unroll
 *	is non-zero, file operations and metadata changes are undone and
 *	handle locks are released (discarding handles this operation created).
 *	Failures are reported and accumulated; the record is always reset.
 */
static int
__meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_RET;
	int op_ret;

	/*
	 * Unlock handles and complete checkpoints regardless of whether we are
	 * unrolling; every other record type only matters when unrolling.
	 */
	if (unroll ||
	    trk->op == WT_ST_CHECKPOINT || trk->op == WT_ST_LOCK)
		switch (trk->op) {
		case WT_ST_EMPTY:	/* Unused slot */
			break;
		case WT_ST_CHECKPOINT:	/* Checkpoint, see above */
			/* Checkpoints are only resolved on commit. */
			if (unroll)
				break;
			btree = trk->dhandle->handle;
			bm = btree->bm;
			WT_WITH_DHANDLE(session, trk->dhandle,
			    WT_TRET(bm->checkpoint_resolve(bm, session)));
			break;
		case WT_ST_LOCK:	/* Handle lock, see above */
			/* A handle created by a failed operation is discarded. */
			if (unroll && trk->created)
				F_SET(trk->dhandle, WT_DHANDLE_DISCARD);
			WT_WITH_DHANDLE(session, trk->dhandle,
			    WT_TRET(__wt_session_release_btree(session)));
			break;
		case WT_ST_FILEOP:	/* File operation */
			/*
			 * For renames, both a and b are set.
			 * For creates, a is NULL.
			 * For removes, b is NULL.
			 */
			if (trk->a == NULL) {
				/* Undo a create by removing the file. */
				op_ret = __wt_remove(
				    session, trk->b + strlen("file:"));
				if (op_ret != 0) {
					__wt_err(session, op_ret,
					    "metadata unroll create %s",
					    trk->b);
					WT_TRET(op_ret);
				}
			} else if (trk->b != NULL) {
				/* Undo a rename by renaming back. */
				op_ret = __wt_rename(session,
				    trk->b + strlen("file:"),
				    trk->a + strlen("file:"));
				if (op_ret != 0) {
					__wt_err(session, op_ret,
					    "metadata unroll rename "
					    "%s to %s",
					    trk->b, trk->a);
					WT_TRET(op_ret);
				}
			}
			/*
			 * We can't undo removes yet: that would imply
			 * some kind of temporary rename and remove in
			 * roll forward.
			 */
			break;
		case WT_ST_REMOVE:	/* Remove trk.a */
			op_ret = __wt_metadata_remove(session, trk->a);
			if (op_ret != 0) {
				__wt_err(session, op_ret,
				    "metadata unroll remove: %s",
				    trk->a);
				WT_TRET(op_ret);
			}
			break;
		case WT_ST_SET:		/* Set trk.a to trk.b */
			op_ret = __wt_metadata_update(
			    session, trk->a, trk->b);
			if (op_ret != 0) {
				__wt_err(session, op_ret,
				    "metadata unroll update %s to %s",
				    trk->a, trk->b);
				WT_TRET(op_ret);
			}
			break;
		WT_ILLEGAL_VALUE(session);
		}

	/* Return the slot to its unused state. */
	trk->op = WT_ST_EMPTY;
	__wt_free(session, trk->a);
	__wt_free(session, trk->b);
	trk->dhandle = NULL;

	return (ret);
}
/*
 * __wt_curfile_open --
 *	WT_SESSION->open_cursor method for the btree cursor type.
 *
 *	Decodes the bulk-load settings (ignored for in-memory databases), gets
 *	the btree handle for a "file:" URI -- under the checkpoint lock for
 *	exclusive opens that asked to wait for checkpoints -- and creates the
 *	cursor.  If cursor creation fails the handle is released again.
 */
int
__wt_curfile_open(WT_SESSION_IMPL *session, const char *uri,
    WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
{
	WT_CONFIG_ITEM cval;
	WT_DECL_RET;
	uint32_t flags;
	bool bitmap, bulk, checkpoint_wait, is_boolean;

	flags = 0;
	bitmap = false;
	bulk = false;
	checkpoint_wait = true;

	/*
	 * Decode the bulk configuration settings. In memory databases
	 * ignore bulk load.
	 */
	if (!F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) {
		WT_RET(__wt_config_gets_def(session, cfg, "bulk", 0, &cval));

		/* A boolean, or a numeric 0 or 1, is a plain on/off switch. */
		is_boolean = cval.type == WT_CONFIG_ITEM_BOOL ||
		    (cval.type == WT_CONFIG_ITEM_NUM &&
		    (cval.val == 0 || cval.val == 1));
		if (is_boolean) {
			bulk = cval.val != 0;
			bitmap = false;
		} else if (WT_STRING_MATCH("bitmap", cval.str, cval.len)) {
			/* A bitmap load is a bulk load as well. */
			bitmap = true;
			bulk = true;
		}
		/*
		 * Unordered bulk insert is a special case used
		 * internally by index creation on existing tables. It
		 * doesn't enforce any special semantics at the file
		 * level. It primarily exists to avoid some locking
		 * problems between LSM and index creation.
		 */
		else if (!WT_STRING_MATCH("unordered", cval.str, cval.len))
			WT_RET_MSG(session, EINVAL,
			    "Value for 'bulk' must be a boolean or 'bitmap'");

		/* Only bulk loads look at the checkpoint_wait setting. */
		if (bulk) {
			WT_RET(__wt_config_gets(session,
			    cfg, "checkpoint_wait", &cval));
			checkpoint_wait = cval.val != 0;
		}
	}

	/* Bulk handles require exclusive access. */
	if (bulk)
		LF_SET(WT_BTREE_BULK | WT_DHANDLE_EXCLUSIVE);

	/*
	 * Only "file:" objects are supported.  Get the handle and keep it
	 * locked while the cursor is using it.
	 */
	if (!WT_PREFIX_MATCH(uri, "file:")) {
		WT_RET(__wt_bad_object_type(session, uri));
	} else {
		/*
		 * If we are opening exclusive and don't want a bulk cursor
		 * open to fail with EBUSY due to a database-wide checkpoint,
		 * get the handle while holding the checkpoint lock.
		 */
		if (!LF_ISSET(WT_DHANDLE_EXCLUSIVE) || !checkpoint_wait)
			ret = __wt_session_get_btree_ckpt(
			    session, uri, cfg, flags);
		else
			WT_WITH_CHECKPOINT_LOCK(session,
			    ret = __wt_session_get_btree_ckpt(
			    session, uri, cfg, flags));
		WT_RET(ret);
	}

	WT_ERR(__curfile_create(session, owner, cfg, bulk, bitmap, cursorp));

	return (0);

err:	/* The cursor was not created: give the handle back. */
	WT_TRET(__wt_session_release_btree(session));
	return (ret);
}
/*
 * __meta_track_unroll --
 *	Undo the change described by one metadata tracking record, then clear
 *	the record.
 *
 *	Handle locks are released (handles created by the operation are marked
 *	for discard first), renames are reversed, created files are removed
 *	and metadata entries are removed or restored.  Failures are reported
 *	and accumulated into the return value.
 */
static int
__meta_track_unroll(WT_SESSION_IMPL *session, WT_META_TRACK *trk)
{
	WT_DECL_RET;
	int op_ret;

	switch (trk->op) {
	case WT_ST_EMPTY:	/* Unused slot */
	case WT_ST_CHECKPOINT:	/* Checkpoint, see above */
	case WT_ST_DROP_COMMIT:
		/* Nothing to undo for these. */
		break;
	case WT_ST_LOCK:	/* Handle lock, see above */
		if (trk->created)
			F_SET(trk->dhandle, WT_DHANDLE_DISCARD);
		WT_WITH_DHANDLE(session, trk->dhandle,
		    WT_TRET(__wt_session_release_btree(session)));
		break;
	case WT_ST_FILEOP:	/* File operation */
		/*
		 * For renames, both a and b are set.
		 * For creates, a is NULL.
		 * For removes, b is NULL.
		 */
		if (trk->a == NULL) {
			/* Undo a create by removing the file. */
			op_ret = __wt_remove(
			    session, trk->b + strlen("file:"));
			if (op_ret != 0) {
				__wt_err(session, op_ret,
				    "metadata unroll create %s",
				    trk->b);
				WT_TRET(op_ret);
			}
		} else if (trk->b != NULL) {
			/* Undo a rename by renaming back. */
			op_ret = __wt_rename(session,
			    trk->b + strlen("file:"),
			    trk->a + strlen("file:"));
			if (op_ret != 0) {
				__wt_err(session, op_ret,
				    "metadata unroll rename %s to %s",
				    trk->b, trk->a);
				WT_TRET(op_ret);
			}
		}
		/*
		 * We can't undo removes yet: that would imply
		 * some kind of temporary rename and remove in
		 * roll forward.
		 */
		break;
	case WT_ST_REMOVE:	/* Remove trk.a */
		op_ret = __wt_metadata_remove(session, trk->a);
		if (op_ret != 0) {
			__wt_err(session, op_ret,
			    "metadata unroll remove: %s",
			    trk->a);
			WT_TRET(op_ret);
		}
		break;
	case WT_ST_SET:		/* Set trk.a to trk.b */
		op_ret = __wt_metadata_update(session, trk->a, trk->b);
		if (op_ret != 0) {
			__wt_err(session, op_ret,
			    "metadata unroll update %s to %s",
			    trk->a, trk->b);
			WT_TRET(op_ret);
		}
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* The record has been dealt with either way. */
	__meta_track_clear(session, trk);
	return (ret);
}
/*
 * __wt_lsm_checkpoint_chunk --
 *	Flush a single LSM chunk to disk.
 *
 *	Returns 0 without doing any flushing when the chunk is already on disk
 *	or when its switch transaction is unset or not yet visible to all.
 *	Otherwise the chunk's leaves are written, the chunk is checkpointed
 *	under the schema lock, the LSM metadata is updated and follow-up work
 *	(bloom filter or merge) is queued.  Any failure is returned at once.
 */
int
__wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
{
	WT_DECL_RET;
	WT_TXN_ISOLATION saved_isolation;

	/*
	 * If the chunk is already checkpointed, make sure it is also evicted.
	 * Either way, there is no point trying to checkpoint it again.
	 */
	if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
	    !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) &&
	    !chunk->evicted) {
		/* A busy handle is not an error: leave evicted unset. */
		if ((ret = __lsm_discard_handle(
		    session, chunk->uri, NULL)) == 0)
			chunk->evicted = 1;
		else if (ret == EBUSY)
			ret = 0;
		else
			WT_RET_MSG(session, ret, "discard handle");
	}
	if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
		WT_RET(__wt_verbose(session, WT_VERB_LSM,
		    "LSM worker %s already on disk",
		    chunk->uri));
		return (0);
	}

	/* Stop if a running transaction needs the chunk. */
	__wt_txn_update_oldest(session);
	if (chunk->switch_txn == WT_TXN_NONE ||
	    !__wt_txn_visible_all(session, chunk->switch_txn)) {
		WT_RET(__wt_verbose(session, WT_VERB_LSM,
		    "LSM worker %s: running transaction, return",
		    chunk->uri));
		return (0);
	}

	WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker flushing %s",
	    chunk->uri));

	/*
	 * Flush the file before checkpointing: this is the expensive part in
	 * terms of I/O.
	 *
	 * Use the special eviction isolation level to avoid interfering with
	 * an application checkpoint: we have already checked that all of the
	 * updates in this chunk are globally visible.
	 *
	 * !!! We can wait here for checkpoints and fsyncs to complete, which
	 * can be a long time.
	 */
	if ((ret = __wt_session_get_btree(
	    session, chunk->uri, NULL, NULL, 0)) == 0) {
		/* The session's isolation is restored right after the flush. */
		saved_isolation = session->txn.isolation;
		session->txn.isolation = TXN_ISO_EVICTION;
		ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES);
		session->txn.isolation = saved_isolation;
		WT_TRET(__wt_session_release_btree(session));
	}
	WT_RET(ret);

	WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing %s",
	    chunk->uri));

	WT_WITH_SCHEMA_LOCK(session,
	    ret = __wt_schema_worker(session, chunk->uri,
	    __wt_checkpoint, NULL, NULL, 0));

	if (ret != 0)
		WT_RET_MSG(session, ret, "LSM checkpoint");

	/* Now the file is written, get the chunk size. */
	WT_RET(__wt_lsm_tree_set_chunk_size(session, chunk));

	/* Update the flush timestamp to help track ongoing progress. */
	WT_RET(__wt_epoch(session, &lsm_tree->last_flush_ts));

	/* Lock the tree, mark the chunk as on disk and update the metadata. */
	WT_RET(__wt_lsm_tree_writelock(session, lsm_tree));
	F_SET(chunk, WT_LSM_CHUNK_ONDISK);
	ret = __wt_lsm_meta_write(session, lsm_tree);
	++lsm_tree->dsk_gen;

	/* Update the throttle time. */
	__wt_lsm_tree_throttle(session, lsm_tree, 1);
	/* Unlock before looking at the metadata write result. */
	WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));

	if (ret != 0)
		WT_RET_MSG(session, ret, "LSM metadata write");

	/*
	 * Clear the no-eviction flag so the primary can be evicted and
	 * eventually closed.  Only do this once the checkpoint has succeeded:
	 * otherwise, accessing the leaf page during the checkpoint can trigger
	 * forced eviction.
	 */
	WT_RET(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0));
	__wt_btree_evictable(session, 1);
	WT_RET(__wt_session_release_btree(session));

	/* Make sure we aren't pinning a transaction ID. */
	__wt_txn_release_snapshot(session);

	WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointed %s",
	    chunk->uri));

	/*
	 * Schedule a bloom filter create for our newly flushed chunk, or a
	 * merge when bloom filters are turned off for the tree.
	 */
	if (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF))
		WT_RET(__wt_lsm_manager_push_entry(
		    session, WT_LSM_WORK_BLOOM, 0, lsm_tree));
	else
		WT_RET(__wt_lsm_manager_push_entry(
		    session, WT_LSM_WORK_MERGE, 0, lsm_tree));
	return (0);
}
/*
 * __wt_session_compact --
 *	WT_SESSION.compact method.
 *
 *	uri names the object to compact and config is the caller's
 *	configuration string.  URIs that are not colgroup:, file:, index:,
 *	lsm: or table: are handed to a matching data source's compact method
 *	when there is one.  Handles gathered while analyzing the object are
 *	always ended and released before returning, and the connection's
 *	compaction success or failure statistic is incremented.
 */
int
__wt_session_compact(
    WT_SESSION *wt_session, const char *uri, const char *config)
{
	WT_COMPACT_STATE compact;
	WT_CONFIG_ITEM cval;
	WT_DATA_SOURCE *dsrc;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_TXN *txn;
	u_int i;

	session = (WT_SESSION_IMPL *)wt_session;
	/* Enter the API; "cfg" used below comes from this macro. */
	SESSION_API_CALL(session, compact, config, cfg);

	/* In-memory is already as compact as it's going to get. */
	if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
		goto err;

	/*
	 * Non-LSM object compaction requires checkpoints, which are impossible
	 * in transactional contexts. Disallow in all contexts (there's no
	 * reason for LSM to allow this, possible or not), and check now so the
	 * error message isn't confusing.
	 */
	txn = &session->txn;
	if (F_ISSET(txn, WT_TXN_RUNNING))
		WT_ERR_MSG(session, EINVAL,
		    "compaction not permitted in a transaction");

	/* Disallow objects in the WiredTiger name space. */
	WT_ERR(__wt_str_name_check(session, uri));

	/*
	 * Anything that is not one of the built-in object types is either a
	 * custom data source (which may or may not implement compact) or an
	 * unknown object type.  Either way this function is finished.
	 */
	if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
	    !WT_PREFIX_MATCH(uri, "file:") &&
	    !WT_PREFIX_MATCH(uri, "index:") &&
	    !WT_PREFIX_MATCH(uri, "lsm:") &&
	    !WT_PREFIX_MATCH(uri, "table:")) {
		if ((dsrc = __wt_schema_get_source(session, uri)) != NULL)
			ret = dsrc->compact == NULL ?
			    __wt_object_unsupported(session, uri) :
			    dsrc->compact(
			    dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg);
		else
			ret = __wt_bad_object_type(session, uri);
		goto err;
	}

	/*
	 * Setup the session handle's compaction state structure.  The state
	 * lives on this function's stack, so the session's pointer to it is
	 * cleared again before returning.
	 */
	memset(&compact, 0, sizeof(WT_COMPACT_STATE));
	session->compact = &compact;

	/* Compaction can be time-limited. */
	WT_ERR(__wt_config_gets(session, cfg, "timeout", &cval));
	session->compact->max_time = (uint64_t)cval.val;
	__wt_epoch(session, &session->compact->begin);

	/* Find the types of data sources being compacted. */
	WT_WITH_SCHEMA_LOCK(session, ret =
	    __wt_schema_worker(session, uri,
	    __compact_handle_append, __compact_uri_analyze, cfg, 0));
	WT_ERR(ret);

	/* LSM objects and plain files are compacted by separate paths. */
	if (session->compact->lsm_count != 0)
		WT_ERR(__wt_schema_worker(
		    session, uri, NULL, __wt_lsm_compact, cfg, 0));
	if (session->compact->file_count != 0)
		WT_ERR(__compact_worker(session));

	/* Common exit: reached on success as well as on error. */
err:	session->compact = NULL;

	/* End compaction on, and release, every handle that was gathered. */
	for (i = 0; i < session->op_handle_next; ++i) {
		WT_WITH_DHANDLE(session, session->op_handle[i],
		    WT_TRET(__compact_end(session)));
		WT_WITH_DHANDLE(session, session->op_handle[i],
		    WT_TRET(__wt_session_release_btree(session)));
	}

	__wt_free(session, session->op_handle);
	session->op_handle_allocated = session->op_handle_next = 0;

	/*
	 * Release common session resources (for example, checkpoint may acquire
	 * significant reconciliation structures/memory).
	 */
	WT_TRET(__wt_session_release_resources(session));

	if (ret != 0)
		WT_STAT_CONN_INCR(session, session_table_compact_fail);
	else
		WT_STAT_CONN_INCR(session, session_table_compact_success);
	API_END_RET_NOTFOUND_MAP(session, ret);
}
/*
 * __wt_lsm_checkpoint_chunk --
 *	Flush a single LSM chunk to disk.
 *
 *	Returns 0 without doing any flushing when the chunk is already on
 *	disk, when its switch transaction is unset or not yet visible to all,
 *	or when the chunk's "flushing" flag could not be claimed.  Otherwise
 *	the chunk's leaves are written, the chunk is checkpointed, the LSM
 *	metadata is updated and follow-up work (bloom filter or merge) is
 *	queued.  The flushing flag is cleared again on every exit path that
 *	set it.
 */
int
__wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
{
	WT_DECL_RET;
	WT_TXN_ISOLATION saved_isolation;
	bool flush_set;

	/* Tracks whether this call currently owns chunk->flushing. */
	flush_set = false;

	/*
	 * If the chunk is already checkpointed, make sure it is also evicted.
	 * Either way, there is no point trying to checkpoint it again.
	 */
	if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
	    !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) &&
	    !chunk->evicted) {
		WT_WITH_HANDLE_LIST_LOCK(session,
		    ret = __lsm_discard_handle(session, chunk->uri, NULL));
		/* A busy handle is not an error: leave evicted unset. */
		if (ret == 0)
			chunk->evicted = 1;
		else if (ret == EBUSY)
			ret = 0;
		else
			WT_RET_MSG(session, ret, "discard handle");
	}
	if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
		WT_RET(__wt_verbose(session, WT_VERB_LSM,
		    "LSM worker %s already on disk",
		    chunk->uri));
		return (0);
	}

	/* Stop if a running transaction needs the chunk. */
	__wt_txn_update_oldest(session, true);
	if (chunk->switch_txn == WT_TXN_NONE ||
	    !__wt_txn_visible_all(session, chunk->switch_txn)) {
		WT_RET(__wt_verbose(session, WT_VERB_LSM,
		    "LSM worker %s: running transaction, return",
		    chunk->uri));
		return (0);
	}

	/*
	 * Claim the chunk by swapping its flushing flag from 0 to 1; if the
	 * swap fails the flag was already set, so there is nothing to do.
	 * From here on errors go through the err label so the flag is reset.
	 */
	if (!__wt_atomic_cas8(&chunk->flushing, 0, 1))
		return (0);
	flush_set = true;

	WT_ERR(__wt_verbose(session, WT_VERB_LSM, "LSM worker flushing %s",
	    chunk->uri));

	/*
	 * Flush the file before checkpointing: this is the expensive part in
	 * terms of I/O.
	 *
	 * !!!
	 * We can wait here for checkpoints and fsyncs to complete, which can
	 * take a long time.
	 */
	if ((ret = __wt_session_get_btree(
	    session, chunk->uri, NULL, NULL, 0)) == 0) {
		/*
		 * Set read-uncommitted: we have already checked that all of the
		 * updates in this chunk are globally visible, use the cheapest
		 * possible check in reconciliation.
		 */
		saved_isolation = session->txn.isolation;
		session->txn.isolation = WT_ISO_READ_UNCOMMITTED;
		ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES);
		session->txn.isolation = saved_isolation;
		WT_TRET(__wt_session_release_btree(session));
	}
	WT_ERR(ret);

	WT_ERR(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing %s",
	    chunk->uri));

	/*
	 * Turn on metadata tracking to ensure the checkpoint gets the
	 * necessary handle locks.
	 *
	 * Ensure that we don't race with a running checkpoint: the checkpoint
	 * lock protects against us racing with an application checkpoint in
	 * this chunk.  Don't wait for it, though: checkpoints can take a long
	 * time, and our checkpoint operation should be very quick.
	 */
	WT_ERR(__wt_meta_track_on(session));
	WT_WITH_CHECKPOINT_LOCK(session, ret,
	    WT_WITH_SCHEMA_LOCK(session, ret,
		ret = __wt_schema_worker(
		session, chunk->uri, __wt_checkpoint, NULL, NULL, 0)));
	/* Tracking is always turned off; the last argument is "ret != 0". */
	WT_TRET(__wt_meta_track_off(session, false, ret != 0));
	if (ret != 0)
		WT_ERR_MSG(session, ret, "LSM checkpoint");

	/* Now the file is written, get the chunk size. */
	WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));

	/* Update the flush timestamp to help track ongoing progress. */
	WT_ERR(__wt_epoch(session, &lsm_tree->last_flush_ts));
	++lsm_tree->chunks_flushed;

	/* Lock the tree, mark the chunk as on disk and update the metadata. */
	WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
	F_SET(chunk, WT_LSM_CHUNK_ONDISK);
	ret = __wt_lsm_meta_write(session, lsm_tree);
	++lsm_tree->dsk_gen;

	/* Update the throttle time. */
	__wt_lsm_tree_throttle(session, lsm_tree, true);
	/* Unlock before looking at the metadata write result. */
	WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
	if (ret != 0)
		WT_ERR_MSG(session, ret, "LSM metadata write");

	/* The flush itself is complete: give up the flushing flag. */
	WT_PUBLISH(chunk->flushing, 0);
	flush_set = false;

	/*
	 * Clear the no-eviction flag so the primary can be evicted and
	 * eventually closed.  Only do this once the checkpoint has succeeded:
	 * otherwise, accessing the leaf page during the checkpoint can trigger
	 * forced eviction.
	 */
	WT_ERR(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0));
	__wt_btree_evictable(session, true);
	WT_ERR(__wt_session_release_btree(session));

	/* Make sure we aren't pinning a transaction ID. */
	__wt_txn_release_snapshot(session);

	WT_ERR(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointed %s",
	    chunk->uri));

	/*
	 * Schedule a bloom filter create for our newly flushed chunk, or a
	 * merge when bloom filters are turned off for the tree.
	 */
	if (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF))
		WT_ERR(__wt_lsm_manager_push_entry(
		    session, WT_LSM_WORK_BLOOM, 0, lsm_tree));
	else
		WT_ERR(__wt_lsm_manager_push_entry(
		    session, WT_LSM_WORK_MERGE, 0, lsm_tree));

	/* Reached on success too; only resets the flag if still owned. */
err:	if (flush_set)
		WT_PUBLISH(chunk->flushing, 0);

	return (ret);
}
/*
 * __wt_schema_worker --
 *	Resolve a schema object to its underlying btree handle(s) and run the
 *	caller's worker function against each one.
 *
 *	name_func, when supplied, is called with the name of each object
 *	visited; file_func, when supplied, is called for each "file:" object
 *	once its btree handle has been acquired. Tables fan out to their
 *	column groups and indices, column groups and indices resolve to their
 *	source, LSM trees are handed to __wt_lsm_tree_worker, and any other URI
 *	is looked up as a data source. Returns zero on success and a non-zero
 *	error code otherwise.
 */
int
__wt_schema_worker(WT_SESSION_IMPL *session,
    const char *uri,
    int (*file_func)(WT_SESSION_IMPL *, const char *[]),
    int (*name_func)(WT_SESSION_IMPL *, const char *),
    const char *cfg[], uint32_t open_flags)
{
	WT_COLGROUP *colgroup;
	WT_DATA_SOURCE *dsrc;
	WT_DECL_RET;
	WT_INDEX *idx;
	WT_SESSION *wt_session;
	WT_TABLE *table;
	const char *objname;
	u_int i;

	table = NULL;
	objname = uri;

	/* The name callback always sees the object itself first. */
	if (name_func != NULL)
		WT_ERR(name_func(session, uri));

	/*
	 * Each object type is handled in its own section below, and every
	 * section finishes by jumping to the shared exit, so a URI is only
	 * ever dispatched once. The order of the prefix tests matters.
	 */

	/* File: call the worker between getting and releasing the handle. */
	if (WT_PREFIX_MATCH(uri, "file:")) {
		if (file_func == NULL)
			goto err;

		WT_ERR(__wt_session_get_btree_ckpt(
		    session, uri, cfg, open_flags));
		ret = file_func(session, cfg);
		WT_TRET(__wt_session_release_btree(session));
		goto err;
	}

	/* Column group: recurse on whatever the column group is stored in. */
	if (WT_PREFIX_MATCH(uri, "colgroup:")) {
		WT_ERR(__wt_schema_get_colgroup(session, uri, NULL, &colgroup));
		WT_ERR(__wt_schema_worker(session,
		    colgroup->source, file_func, name_func, cfg, open_flags));
		goto err;
	}

	/* Index: recurse on whatever the index is stored in. */
	if (WT_PREFIX_SKIP(objname, "index:")) {
		idx = NULL;
		WT_ERR(__wt_schema_get_index(session, uri, NULL, &idx));
		WT_ERR(__wt_schema_worker(session,
		    idx->source, file_func, name_func, cfg, open_flags));
		goto err;
	}

	/* LSM tree: the LSM code walks its own chunks. */
	if (WT_PREFIX_MATCH(uri, "lsm:")) {
		WT_ERR(__wt_lsm_tree_worker(
		    session, uri, file_func, name_func, cfg, open_flags));
		goto err;
	}

	/* Table: visit every column group, then every index. */
	if (WT_PREFIX_SKIP(objname, "table:")) {
		WT_ERR(__wt_schema_get_table(session,
		    objname, strlen(objname), 0, &table));
		WT_ASSERT(session, session->dhandle == NULL);

		/*
		 * The table is already open, so rather than recursing on the
		 * colgroup and index URIs, go directly to their sources. The
		 * name callback still has to be told about the intermediate
		 * objects that shortcut skips over.
		 */
		for (i = 0; i < WT_COLGROUPS(table); ++i) {
			colgroup = table->cgroups[i];
			if (name_func != NULL)
				WT_ERR(name_func(session, colgroup->name));
			WT_ERR(__wt_schema_worker(session, colgroup->source,
			    file_func, name_func, cfg, open_flags));
		}

		WT_ERR(__wt_schema_open_indices(session, table));
		for (i = 0; i < table->nindices; ++i) {
			idx = table->indices[i];
			if (name_func != NULL)
				WT_ERR(name_func(session, idx->name));
			WT_ERR(__wt_schema_worker(session, idx->source,
			    file_func, name_func, cfg, open_flags));
		}
		goto err;
	}

	/*
	 * Anything else is looked up as a data source; a URI with no matching
	 * data source is a bad object type.
	 */
	dsrc = __wt_schema_get_source(session, uri);
	if (dsrc == NULL) {
		WT_ERR(__wt_bad_object_type(session, uri));
		goto err;
	}

	/*
	 * Map the worker function onto the data source's own method of the
	 * same kind; operations without a matching method are unsupported.
	 */
	wt_session = (WT_SESSION *)session;
	if (file_func == __wt_compact && dsrc->compact != NULL) {
		WT_ERR(dsrc->compact(
		    dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg));
	} else if (file_func == __wt_salvage && dsrc->salvage != NULL) {
		WT_ERR(dsrc->salvage(
		    dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg));
	} else if (file_func == __wt_verify && dsrc->verify != NULL) {
		WT_ERR(dsrc->verify(
		    dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg));
	} else {
		WT_ERR(__wt_object_unsupported(session, uri));
	}

	/* Shared exit: give back the table if one was acquired above. */
err:	if (table != NULL)
		__wt_schema_release_table(session, table);
	return (ret);
}